In [1]:
import pandas as pd

In [11]:
df = pd.read_csv("train.csv", encoding="latin1")

In [13]:
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [17]:
df.columns

Index(['textID', 'text', 'selected_text', 'sentiment', 'Time of Tweet',
       'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)',
       'Density (P/Km²)'],
      dtype='object')

In [19]:
df.isnull().sum()

textID              0
text                1
selected_text       1
sentiment           0
Time of Tweet       0
Age of User         0
Country             0
Population -2020    0
Land Area (Km²)     0
Density (P/Km²)     0
dtype: int64

In [25]:
df.dropna(inplace=True)

In [27]:
import re

In [31]:
def clean(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [33]:
df["clean_text"] = df["text"].apply(clean)

In [35]:
from sklearn.model_selection import train_test_split

In [39]:
X = df["clean_text"]
y = df["sentiment"]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1,2),
    stop_words='english'
)

In [47]:
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

In [49]:
from sklearn.linear_model import LogisticRegression

In [51]:
model = LogisticRegression(max_iter=200)
model.fit(X_train_vec, y_train)

In [53]:
from sklearn.metrics import classification_report, accuracy_score

In [55]:
y_pred = model.predict(X_test_vec)

In [57]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6859534206695779
              precision    recall  f1-score   support

    negative       0.72      0.60      0.65      1556
     neutral       0.62      0.75      0.68      2223
    positive       0.78      0.69      0.73      1717

    accuracy                           0.69      5496
   macro avg       0.70      0.68      0.69      5496
weighted avg       0.70      0.69      0.69      5496



In [59]:
def predict_sentiment(review):
    review = clean(review)
    vec = tfidf.transform([review])
    return model.predict(vec)[0]

In [71]:
print(predict_sentiment("This product is amazing!"))
print(predict_sentiment("Waste of money"))

positive
negative


In [77]:
import numpy as np

feature_names = np.array(tfidf.get_feature_names_out())
coef = model.coef_[0]

top_negative_words = feature_names[np.argsort(coef)][-20:]
top_positive_words = feature_names[np.argsort(coef)][:20]

print("Top NEGATIVE words:", top_negative_words)
print("Top POSITIVE words:", top_positive_words)


Top NEGATIVE words: ['worst' 'ugh' 'sadly' 'missed' 'missing' 'boring' 'poor' 'fail' 'hurts'
 'headache' 'bad' 'bored' 'tired' 'sick' 'stupid' 'sucks' 'hate' 'miss'
 'sorry' 'sad']
Top POSITIVE words: ['love' 'awesome' 'thanks' 'glad' 'hope' 'thank' 'great' 'hopefully'
 'nice' 'amazing' 'best' 'cute' 'beautiful' 'welcome' 'excited' 'good'
 'yay' 'lovely' 'interesting' 'happy']
