<a href="https://colab.research.google.com/github/Shalini-13/733-assignment/blob/main/classwork4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#Import required libraries
import pandas as pd
from sklearn.model_selection import cross_val_score, GroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import string

#Load data
file_path = '/content/data_stories_one_shot.csv'
df = pd.read_csv(file_path)

#Step 1: Basic Cleaning (Lowercase + Punctuation Removal)
# Fail fast with a clear message: a missing sentence cannot be lowercased and
# would otherwise surface as an opaque error further down.
if df['Sentence'].isna().any():
    raise ValueError("Column 'Sentence' contains missing values; clean or drop them before preprocessing.")
# Build the punctuation-removal table once rather than once per row, then use
# the vectorized string methods instead of a per-row Python lambda.
punctuation_table = str.maketrans('', '', string.punctuation)
df['processed'] = df['Sentence'].str.lower().str.translate(punctuation_table)

#Step 2: Label Setup
# Stage == 1 is labelled 'Show'; every other Stage value becomes 'Tell'.
df['label'] = df['Stage'].apply(lambda x: 'Show' if x == 1 else 'Tell')
# LabelEncoder numbers the classes in sorted order, so when both labels are
# present 'Show' is encoded as 0 and 'Tell' as 1.
df['label_encoded'] = LabelEncoder().fit_transform(df['label'])

#Step 3: Define Models
# Random forests draw bootstrap samples and feature subsets at random; without
# a fixed seed the reported accuracy changes on every run of the notebook.
RANDOM_STATE = 42
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(kernel='linear'),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE)
}

#Step 4: TF-IDF Vectorization + 5-Fold Cross-Validation
# Each classifier is wrapped in its own TF-IDF pipeline and scored with
# 5-fold cross-validation; only the mean accuracy per model is kept.
results_cv = {}
for name, model in models.items():
    steps = [
        ('tfidf', TfidfVectorizer()),
        ('clf', model),
    ]
    pipeline = Pipeline(steps)
    scores = cross_val_score(
        pipeline,
        df['processed'],
        df['label_encoded'],
        cv=5,
        scoring='accuracy',
    )
    results_cv[name] = scores.mean()

#Step 5: Leave-One-Plot-Out Cross-Validation (Logistic Regression)
# One fold per distinct plot name, so every plot is held out in turn.
group_kfold = GroupKFold(n_splits=len(df['Plot_Name'].unique()))
pipeline_lr = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LogisticRegression(max_iter=1000))])
# Pass the splitter and the group labels to cross_val_score directly instead
# of pre-building a one-shot split generator, and name the metric explicitly
# so it is the same one used for the 5-fold comparison in Step 4.
scores_lopo = cross_val_score(
    pipeline_lr,
    df['processed'],
    df['label_encoded'],
    groups=df['Plot_Name'],
    cv=group_kfold,
    scoring='accuracy'
)

#Step 6: Display Results
# Report the mean 5-fold accuracy of each model, one per line.
print("📊 5-Fold Cross-Validation Accuracy:")
for model_name in results_cv:
    acc = results_cv[model_name]
    print(f"{model_name}: {acc:.4f}")

# Report the mean accuracy across the leave-one-plot-out folds.
lopo_mean = scores_lopo.mean()
print(f"\n🔁 Leave-One-Plot-Out Accuracy (Logistic Regression): {lopo_mean:.4f}")

📊 5-Fold Cross-Validation Accuracy:
Logistic Regression: 0.6846
SVM: 0.8385
Naive Bayes: 0.7308
Random Forest: 0.6462

🔁 Leave-One-Plot-Out Accuracy (Logistic Regression): 0.6796
