In [2]:
import pandas as pd

# Relative filename: pandas resolves it against the kernel's working directory.
df = pd.read_csv("data_stories_one_shot.csv")

# Quick look at the first rows to confirm the columns parsed as expected.
first_rows = df.head()
print(first_rows)


  Plot_Name  Stage  Quality                                           Sentence
0  walk dog      1      1.0              This is a line chart with error bars.
1  walk dog      1      1.0                     The chart title is 'Walk dog'.
2  walk dog      1      1.0              The y-axis represents 'Mean anxiety'.
3  walk dog      1      1.0  The x-axis indicates conditions such as 'Basel...
4  walk dog      1      1.0  The chart compares mean anxiety levels with an...


In [3]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import string

# Step 1: Load data. The relative filename is resolved against the kernel's
# working directory. Re-reading here keeps this cell re-runnable on its own.
df = pd.read_csv("data_stories_one_shot.csv")

# Step 2: Clean text (lowercase + remove ASCII punctuation).
# The translation table is built once rather than once per row. Missing
# sentences are treated as empty text so that a blank CSV cell does not abort
# the cell with an AttributeError on `.lower()`.
punctuation_table = str.maketrans('', '', string.punctuation)
df['processed'] = (
    df['Sentence']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.translate(punctuation_table)
)

# Step 3: Encode labels. Stage 1 is 'Show'; every other stage value is 'Tell'.
# LabelEncoder assigns integer codes in sorted class order, so with both
# classes present the mapping is 'Show' -> 0 and 'Tell' -> 1.
df['label'] = df['Stage'].apply(lambda x: 'Show' if x == 1 else 'Tell')
df['label_encoded'] = LabelEncoder().fit_transform(df['label'])

# Step 4: Define ML models
# Fixed seed for the only stochastic estimator in this dict as configured
# here (the random forest), so its reported accuracy is the same on every
# Restart & Run All.
RANDOM_STATE = 42

models = {
    # max_iter raised above the library default to give the solver more
    # iterations on the TF-IDF features.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(kernel='linear'),
    # MultinomialNB requires non-negative features; TF-IDF weights satisfy that.
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE)
}

# Step 5: 5-Fold Cross-Validation using TF-IDF
def _fold_accuracy(estimator, texts, targets, fit_rows, eval_rows):
    """Fit `estimator` on the rows at `fit_rows` and return its accuracy on `eval_rows`.

    `fit_rows` and `eval_rows` are positional indices into `texts` / `targets`.
    """
    estimator.fit(texts.iloc[fit_rows], targets.iloc[fit_rows])
    predicted = estimator.predict(texts.iloc[eval_rows])
    return accuracy_score(targets.iloc[eval_rows], predicted)


skf = StratifiedKFold(n_splits=5)
results_cv = {}

for name, model in models.items():
    # The TF-IDF vocabulary is refit inside every fold because the vectorizer
    # is part of the pipeline, so held-out sentences never influence it.
    pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('clf', model)])
    accuracies = [
        _fold_accuracy(pipeline, df['processed'], df['label_encoded'], train_idx, test_idx)
        for train_idx, test_idx in skf.split(df['processed'], df['label_encoded'])
    ]

    # Unweighted mean of the per-fold accuracies.
    results_cv[name] = sum(accuracies) / len(accuracies)

# Step 6: Leave-One-Plot-Out Cross-Validation using Logistic Regression
# Each plot is held out in turn: train on the sentences of all other plots,
# then score on the held-out plot's sentences.
unique_plots = df['Plot_Name'].unique()
scores_lopo = []

for plot in unique_plots:
    is_held_out = df['Plot_Name'] == plot
    train_data, test_data = df[~is_held_out], df[is_held_out]

    X_train, y_train = train_data['processed'], train_data['label_encoded']
    X_test, y_test = test_data['processed'], test_data['label_encoded']

    # A fresh pipeline per plot: both the vocabulary and the classifier are
    # fit on the training plots only.
    pipeline_lr = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(max_iter=1000)),
    ])
    pipeline_lr.fit(X_train, y_train)
    y_pred = pipeline_lr.predict(X_test)

    # One accuracy value per held-out plot.
    scores_lopo.append(accuracy_score(y_test, y_pred))

# Step 7: Print results
# One line per model, formatted to four decimals.
cv_lines = [f"{model_name}: {acc:.4f}" for model_name, acc in results_cv.items()]

print("📊 5-Fold Cross-Validation Accuracy:")
for cv_line in cv_lines:
    print(cv_line)

# Unweighted mean over plots: every held-out plot counts equally, regardless
# of how many sentences it contains.
mean_lopo = sum(scores_lopo) / len(scores_lopo)
print(f"\n🔁 Leave-One-Plot-Out Accuracy (Logistic Regression): {mean_lopo:.4f}")


📊 5-Fold Cross-Validation Accuracy:
Logistic Regression: 0.6846
SVM: 0.8385
Naive Bayes: 0.7308
Random Forest: 0.6769

🔁 Leave-One-Plot-Out Accuracy (Logistic Regression): 0.6796
