<a href="https://colab.research.google.com/github/NotBinit116/AML-Sentiment/blob/main/Sentiment140_Sentiment_SVM_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup


In [None]:
# Core libraries
import pandas as pd
import numpy as np

# ML & NLP
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Optional: visualization
import matplotlib.pyplot as plt
import seaborn as sns


Loading the Dataset


In [None]:
import kagglehub

# Kaggle handle of the Sentiment140 dataset; kagglehub returns the local
# directory that holds the downloaded files.
DATASET_HANDLE = "kazanova/sentiment140"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)


Using Colab cache for faster access to the 'sentiment140' dataset.
Path to dataset files: /kaggle/input/sentiment140


In [None]:
# The CSV has no header row (header=None) and is decoded as latin-1.
csv_path = path + "/training.1600000.processed.noemoticon.csv"
df = pd.read_csv(csv_path, header=None, encoding="latin-1")

Naming the columns and checking the number of samples per label


In [None]:
# The file was read with header=None, so name the six raw columns here.
df.columns = [
    "label",    # 0 = negative, 4 = positive
    "id",
    "date",
    "query",
    "user",
    "text"
]

# Keep only the model input and target. The explicit .copy() makes the result
# an independent frame, so the column assignments performed by later cells do
# not trigger pandas' SettingWithCopyWarning.
df = df[["text", "label"]].copy()


In [None]:
# Translate the numeric sentiment codes into readable class names.
label_names = {
    0: "negative",
    4: "positive"
}
labels_named = df["label"].map(label_names)

# Series.map turns every value that is missing from the mapping into NaN
# (this also happens when the cell is executed a second time, because the
# column then already holds strings). Fail loudly instead of letting NaN
# labels leak into the train/test split.
if labels_named.isna().any():
    unexpected = df.loc[labels_named.isna(), "label"].unique().tolist()
    raise ValueError(
        f"Unexpected label values (was this cell already run?): {unexpected}"
    )

df["label"] = labels_named

df["label"].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
negative,800000
positive,800000


In [None]:
import re

def clean_tweet(text):
    """Return a lower-cased copy of ``text`` with tweet noise stripped out.

    The removals are applied one after another, in this order:

    1. any run of non-whitespace characters starting with ``http`` (links),
    2. ``@`` followed by word characters (user mentions),
    3. every ``#`` character (the hashtag word itself is kept).

    Whitespace left behind by a removal is not collapsed.
    """
    # Order matters: stripping a link first can leave a bare "@" behind that
    # the mention pattern then (intentionally) no longer matches.
    for pattern in (r"http\S+", r"@\w+", r"#"):
        text = re.sub(pattern, "", text)
    return text.lower()

Downsampling for Speed

In [None]:
# Work on a fixed-seed random subset of the tweets to keep vectorization and
# training time manageable.
SAMPLE_SIZE = 500000
df = df.sample(random_state=42, n=SAMPLE_SIZE)

# Normalize every sampled tweet with clean_tweet before vectorization.
df["text"] = df["text"].map(clean_tweet)

In [None]:
# Features are the cleaned tweet texts; targets are the sentiment labels.
X, y = df["text"], df["label"]

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


Using the TF-IDF vectorizer


In [None]:
# Character-level TF-IDF over 3- to 5-character n-grams. min_df=3 drops
# n-grams seen in fewer than three tweets, and sublinear_tf applies
# logarithmic term-frequency scaling.
tfidf_options = dict(
    analyzer='char',
    ngram_range=(3, 5),
    min_df=3,
    sublinear_tf=True,
)
tfidf = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and IDF weights from the training split only, then
# project the test split onto that same feature space.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

Using a Linear SVM (LinearSVC)

In [None]:
# Linear support-vector classifier trained on the sparse TF-IDF features.
svm_model = LinearSVC(
    C=1.0,            # regularization strength (scikit-learn's default value)
    max_iter=5000,    # explicit iteration cap for the solver
    random_state=42   # same seed as the sampling/split cells, for repeatable fits
)

svm_model.fit(X_train_tfidf, y_train)

Checking Accuracy for the model

In [None]:

# Predict the sentiment of the held-out tweets and report the fraction that
# was classified correctly.
y_pred = svm_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (SVM + Char):", test_accuracy)

Accuracy (SVM + Char): 0.80764


Visualizations

In [None]:
# Confusion matrix of the LinearSVC predictions on the held-out split.
# Passing the model's class order as `labels` keeps the matrix rows/columns
# aligned with the tick labels drawn on the heatmap.
class_names = svm_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_names)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=class_names,
            yticklabels=class_names,
            cmap='Blues',
            ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix – Sentiment140 LinearSVC")
plt.show()

'cm = confusion_matrix(y_test, y_pred)\n\nplt.figure(figsize=(6,4))\nsns.heatmap(cm, annot=True, fmt=\'d\',\n            xticklabels=nb_char.classes_,\n            yticklabels=nb_char.classes_,\n            cmap=\'Blues\')\nplt.xlabel("Predicted")\nplt.ylabel("Actual")\nplt.title("Confusion Matrix – IMDB Baseline Model")\nplt.show() '