In [3]:
import pandas as pd
import string
from sklearn.model_selection import cross_val_score, GroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Load the labelled sentences from the local CSV export. Later steps in this
# script read the 'Sentence', 'Stage' and 'Plot_Name' columns of this frame.
data_path = "C:/Users/Siddhesh/Desktop/IS733/data_stories_one_shot.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

# Text preprocessing: lowercase + remove punctuation
# Translation table that deletes every character in string.punctuation;
# built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Return *text* lower-cased with all ``string.punctuation`` characters removed.

    Whitespace is left untouched, so removing a punctuation mark that sat
    between two spaces leaves both spaces in place.
    """
    lowered = text.lower()
    return lowered.translate(_PUNCTUATION_TABLE)

# Normalised copy of every sentence; this is what the vectorizer sees
df['text'] = df['Sentence'].apply(clean_text)

# Human-readable label: Stage == 1 -> 'Show', any other value -> 'Tell'
df['target'] = df['Stage'].apply(lambda stage: 'Tell' if stage != 1 else 'Show')

# Integer codes for the labels.
# NOTE(review): LabelEncoder numbers classes in sorted order, so this should
# yield 'Show' -> 0 and 'Tell' -> 1 (i.e. 'Show' is NOT encoded as 1).
# Confirm with label_encoder.classes_ before relying on a positive class.
label_encoder = LabelEncoder()
label_encoder.fit(df['target'])
df['target_encoded'] = label_encoder.transform(df['target'])

# Candidate classifiers as (display name, unfitted estimator) pairs. Each
# estimator is used as the final step of a TF-IDF pipeline by evaluate_model.
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('SVM', SVC(kernel='linear')),
    ('Naive Bayes', MultinomialNB()),
    # Fixed seed: without random_state the forest is grown from a different
    # random state on every run, so its reported score is not reproducible.
    ('Random Forest', RandomForestClassifier(random_state=42)),
]

# Function to evaluate model using k-fold CV (5 folds, accuracy by default)
def evaluate_model(model_name, model_obj, cv=5, scoring='accuracy'):
    """Cross-validate a TF-IDF + classifier pipeline on the module-level ``df``.

    Args:
        model_name: Display name; returned unchanged so the caller can label
            the score.
        model_obj: Unfitted scikit-learn classifier used as the final step of
            the pipeline.
        cv: Cross-validation strategy forwarded to ``cross_val_score``.
            Defaults to 5, the value that used to be hard-coded.
        scoring: Metric forwarded to ``cross_val_score``. Defaults to
            ``'accuracy'``, the value that used to be hard-coded.

    Returns:
        A ``(model_name, mean_score)`` tuple, where ``mean_score`` is the mean
        of the per-fold scores.
    """
    # The vectorizer is a pipeline step, so cross_val_score fits it together
    # with the classifier instead of on the full data set up front.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('classifier', model_obj),
    ])
    scores = cross_val_score(
        pipeline,
        df['text'],
        df['target_encoded'],
        cv=cv,
        scoring=scoring,
    )
    return model_name, scores.mean()

# Report the mean 5-fold accuracy of every candidate, one line per model
print("📊 5-Fold Cross-Validation Accuracy:")
for entry in models:
    # Each entry is a (display name, estimator) pair, matching the first two
    # positional parameters of evaluate_model.
    model_name, avg_score = evaluate_model(*entry)
    print(f"{model_name}: {avg_score:.4f}")

# Leave-One-Plot-Out Cross-Validation using Logistic Regression.
# One GroupKFold split per distinct plot name, grouped on 'Plot_Name', so the
# sentences of a plot are never divided between the training and test side.
plot_groups = df['Plot_Name']
lopo = GroupKFold(n_splits=len(plot_groups.unique()))

logreg_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Hand the splitter and the groups to cross_val_score and let it generate the
# folds itself; no explicit scoring is given, so the pipeline's own scorer is used.
lopo_scores = cross_val_score(
    logreg_pipeline,
    df['text'],
    df['target_encoded'],
    groups=plot_groups,
    cv=lopo,
)

print(f"\n🔁 Leave-One-Plot-Out Accuracy (Logistic Regression): {lopo_scores.mean():.4f}")


📊 5-Fold Cross-Validation Accuracy:
Logistic Regression: 0.6846
SVM: 0.8385
Naive Bayes: 0.7308
Random Forest: 0.6692

🔁 Leave-One-Plot-Out Accuracy (Logistic Regression): 0.6796
