In [21]:
# === Import Libraries ===
import pandas as pd
from sklearn.model_selection import cross_val_score, GroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import string

# === Load Dataset ===
# NOTE(review): absolute, machine-specific path; the script only runs where
# the CSV exists at exactly this location.
data_path = '/Users/omkarkalekar/Downloads/data_stories_one_shot.csv'
data = pd.read_csv(filepath_or_buffer=data_path)

# === Preprocess Text: Convert to lowercase and strip punctuation ===
# Built once instead of on every call; maps each ASCII punctuation character
# in string.punctuation to None so str.translate deletes it.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Return *text* lower-cased with ASCII punctuation removed.

    Missing values (``None`` / ``NaN`` / ``pd.NA``, e.g. empty CSV cells read
    by pandas) become an empty string, and any other non-string value is
    converted with ``str()`` first, so the function can be applied to a whole
    column without raising ``AttributeError``.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)
    return text.lower().translate(_PUNCTUATION_TABLE)

# Normalized copy of every sentence (via clean_text); this is the model input.
data['text_clean'] = data['Sentence'].map(clean_text)

# === Generate Target Labels: 'Show' if Stage == 1, else 'Tell' ===
data['target'] = ['Show' if stage == 1 else 'Tell' for stage in data['Stage']]

# Integer-encode the string labels; the fitted encoder is kept so the
# integer codes can be mapped back to 'Show' / 'Tell'.
label_encoder = LabelEncoder()
label_encoder.fit(data['target'])
data['target_encoded'] = label_encoder.transform(data['target'])

# === Initialize Classifiers ===
# Display name -> unfitted estimator. Each one becomes the final step of a
# TF-IDF pipeline in the evaluation loop below.
classifier_dict = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": SVC(kernel='linear'),
    "Multinomial Naive Bayes": MultinomialNB(),
    # Fixed seed: an unseeded forest yields different scores on every run,
    # which makes the model comparison non-reproducible.
    "Random Forest": RandomForestClassifier(random_state=42),
}

# === Evaluate Each Model Using 5-Fold Cross-Validation ===
# Maps each classifier's display name to its mean accuracy over the 5 folds.
accuracy_scores = {}
for clf_name, clf in classifier_dict.items():
    # The vectorizer lives inside the pipeline, so TF-IDF is fitted on the
    # training portion of each fold only.
    model_pipeline = Pipeline(steps=[
        ('vectorizer', TfidfVectorizer()),
        ('model', clf),
    ])
    cv_scores = cross_val_score(
        estimator=model_pipeline,
        X=data['text_clean'],
        y=data['target_encoded'],
        scoring='accuracy',
        cv=5,
    )
    accuracy_scores[clf_name] = cv_scores.mean()

# === Leave-One-Group-Out Cross-Validation: Based on Plot_Name ===
# One fold per distinct Plot_Name: with as many splits as groups, each fold
# holds out all sentences of a single plot, so no plot is shared between the
# training and test portions of a fold.
group_validator = GroupKFold(n_splits=data['Plot_Name'].nunique())
logreg_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('model', LogisticRegression(max_iter=1000))
])

# Pass the splitter and the group labels to cross_val_score rather than a
# pre-built, one-shot split generator. The metric is spelled out so this
# evaluation visibly matches the 5-fold one.
group_scores = cross_val_score(
    logreg_pipeline,
    data['text_clean'],
    data['target_encoded'],
    groups=data['Plot_Name'],
    cv=group_validator,
    scoring='accuracy',
)

# === Print Results ===
# Mean accuracy per classifier from the 5-fold run, one line per model in the
# order the models were inserted into accuracy_scores, at 4 decimal places.
print("Accuracy from Standard 5-Fold Cross-Validation📊:")
for name, score in accuracy_scores.items():
    print(f"{name}: {score:.4f}")

# The leading "\n" prints a blank line between the two result sections;
# group_scores holds one accuracy per held-out plot, averaged here.
print(f"\nAverage Accuracy from Leave-One-Plot-Out (LogReg only): {group_scores.mean():.4f}")


Accuracy from Standard 5-Fold Cross-Validation📊:
Logistic Regression: 0.6846
Support Vector Machine: 0.8385
Multinomial Naive Bayes: 0.7308
Random Forest: 0.6846

Average Accuracy from Leave-One-Plot-Out (LogReg only): 0.6796
