In [4]:
import pickle
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

In [5]:
# Load the raw tweets. pandas decodes as UTF-8 by default; if the file contains
# bytes that are not valid UTF-8, retry once with latin-1.
csv_file = 'train.csv'
try:
    df = pd.read_csv(filepath_or_buffer=csv_file)
except UnicodeDecodeError:
    df = pd.read_csv(filepath_or_buffer=csv_file, encoding='latin-1')

In [6]:
# Quick sanity check of the loaded frame: a few rows plus the column names.
print("First rows:\n", df.head(n=5))
print("Columns:", list(df.columns))

First rows:
        textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment Time of Tweet Age of User  \
0  I`d have responded, if I were going   neutral       morning        0-20   
1                             Sooo SAD  negative          noon       21-30   
2                          bullying me  negative         night       31-45   
3                       leave me alone  negative       morning       46-60   
4                        Sons of ****,  negative          noon       60-70   

       Country  Population -2020  Land Area (Km²)  Density (P/Km²)  
0  Afghanistan          38928346        

In [7]:
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one raw tweet into a lowercase, punctuation-free string.

    Steps, in order: lowercase, drop tokens starting with ``http``, delete
    ASCII punctuation, collapse whitespace runs to a single space, strip.

    Args:
        text: The raw value. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string, or ``''`` for a missing value (so that a missing
        tweet does not become the literal token "none" or "nan").
    """
    # `text != text` is only true for NaN; np.float64 is a float subclass.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)
    text = text.translate(_PUNCT_TABLE)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Replace missing tweets with '' before cleaning: `.astype(str)` would turn NaN
# into the literal word "nan", which then lands in the corpus as a real token.
df['clean_text'] = df['text'].fillna('').apply(clean_text)

In [8]:
# Overview: shape, dtypes, missing values
from IPython.display import display

# Dimensions first, then per-column dtypes, then null counts with the
# worst-affected columns listed first.
print("Shape:", df.shape)
print("\nDtypes:\n", df.dtypes)
print(
    "\nMissing values (count):\n",
    df.isnull().sum().sort_values(ascending=False),
)


Shape: (27481, 11)

Dtypes:
 textID               object
text                 object
selected_text        object
sentiment            object
Time of Tweet        object
Age of User          object
Country              object
Population -2020      int64
Land Area (Km²)     float64
Density (P/Km²)       int64
clean_text           object
dtype: object

Missing values (count):
 text                1
selected_text       1
textID              0
sentiment           0
Time of Tweet       0
Age of User         0
Country             0
Population -2020    0
Land Area (Km²)     0
Density (P/Km²)     0
clean_text          0
dtype: int64


In [9]:
# Features are the cleaned tweets; the target is the sentiment label.
X, y = df['clean_text'], df['sentiment']

# 4. Split
# 70/30 hold-out, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# 5. Vectorization
# Unigrams and bigrams, English stop words removed, capped at 2000 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=2000,
    stop_words='english',
)


In [10]:
# Fit the TF-IDF vocabulary on the training split only, then apply the same
# fitted vectorizer to the test split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Save vectorizer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Save vectorized data
sp.save_npz('X_train_vec.npz', X_train_vec)
sp.save_npz('X_test_vec.npz', X_test_vec)

# Save labels as plain unicode arrays. Passing the Series directly stores an
# object-dtype array, which NumPy pickles and which a default `np.load`
# (allow_pickle=False) refuses to read back.
np.save('y_train.npy', y_train.to_numpy(dtype=str))
np.save('y_test.npy', y_test.to_numpy(dtype=str))

print("Saved: vectorizer.pkl, X_train_vec.npz, X_test_vec.npz, y_train.npy, y_test.npy")

Saved: vectorizer.pkl, X_train_vec.npz, X_test_vec.npz, y_train.npy, y_test.npy


In [11]:
# 6. Baseline models
# Display name -> unfitted estimator. Insertion order is the order in which the
# models are trained and reported.
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('SVM', SVC(kernel='linear', probability=True, random_state=42)),
])


In [12]:
# Fit each baseline on the TF-IDF training matrix and score it on the held-out
# split; one metrics row per model.
results = []
for name in models:
    model = models[name]
    y_pred = model.fit(X_train_vec, y_train).predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)
    # Weighted F1: per-class F1 averaged by class support.
    f1 = f1_score(y_test, y_pred, average='weighted')
    results.append(dict(zip(('Model', 'Accuracy', 'F1 Score'), (name, acc, f1))))

In [16]:
# Turn the list of per-model metric rows into a single comparison table.
results_df = pd.DataFrame(data=results)
print('\nBaseline Model Performance:', results_df, sep='\n')


Baseline Model Performance:
                 Model  Accuracy  F1 Score
0  Logistic Regression  0.682110  0.681885
1                  KNN  0.516434  0.497001
2        Decision Tree  0.640995  0.640509
3        Random Forest  0.688296  0.687325
4                  SVM  0.690358  0.689672


In [17]:
# Save results dataframe
# Written twice: CSV (without the index column) and a pickle of the frame.
results_df.to_csv(path_or_buf='baseline_results.csv', index=False)
results_df.to_pickle(path='baseline_results.pkl')

print("\nSaved: baseline_results.csv, baseline_results.pkl")


Saved: baseline_results.csv, baseline_results.pkl


In [18]:
# Find and save the best performing model
# Row label of the highest weighted-F1 score -> model name -> the estimator
# that was fitted in the training loop.
best_model_name = results_df.at[results_df['F1 Score'].idxmax(), 'Model']
best_model = models[best_model_name]

with open('best_baseline_model.pkl', 'wb') as f:
    f.write(pickle.dumps(best_model))

print(f"\nBest baseline model: {best_model_name}")
print("Saved as: best_baseline_model.pkl")


Best baseline model: SVM
Saved as: best_baseline_model.pkl
