# Sentiment Analysis - Production Ready

## Install Required Packages (uncomment if needed):
```bash
# For real datasets:
# pip install datasets
# pip install kaggle  # For Kaggle datasets

# For better models (optional):
# pip install transformers torch
# pip install nltk
```

In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import joblib

In [32]:
df = pd.read_csv('training.1600000.processed.noemoticon.csv',
                 encoding='latin-1',
                 names=['sentiment', 'id', 'date', 'query', 'user', 'text'])
                 

In [33]:
df

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [34]:
df.isnull().sum()

sentiment    0
id           0
date         0
query        0
user         0
text         0
dtype: int64

In [35]:
df['sentiment'] = df['sentiment'].map({0: 0, 4: 1})  # Map to 0 and 1
df = df.sample(n=100000, random_state=42)  # Sample 100k rows

In [36]:
import re

def clean_tweet(text):
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)            # Remove @mentions
    text = re.sub(r'#(\w+)', r'\1', text)       # Remove # but keep word
    text = text.lower()
    text = ' '.join(text.split())
    return text

df['text'] = df['text'].apply(clean_tweet)
print("Text cleaned!")


Text cleaned!


In [37]:
df
X = df['text']
y = df['sentiment']

In [38]:
df

Unnamed: 0,sentiment,id,date,query,user,text
541200,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,ahhh i hope your ok!!!
750,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"cool , i have no tweet apps for my razr 2"
766711,0,2300048954,Tue Jun 23 13:40:11 PDT 2009,NO_QUERY,sammydearr,i know just family drama. its lame.hey next ti...
285055,0,1993474027,Mon Jun 01 10:26:07 PDT 2009,NO_QUERY,Lamb_Leanne,school email won't open and i have geography s...
705995,0,2256550904,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,yogicerdito,upper airways problem
...,...,...,...,...,...,...
159217,0,1956786300,Thu May 28 22:41:13 PDT 2009,NO_QUERY,BelleDessy,finallytryingto download evernote on my bberry...
298540,0,1997706652,Mon Jun 01 17:19:04 PDT 2009,NO_QUERY,DanaLouLou,too late. guess i'm being stuffed in the trunk.
839945,1,1559579758,Sun Apr 19 11:30:43 PDT 2009,NO_QUERY,beex3,on the way to target with mah sistaaa
732586,0,2264101639,Sun Jun 21 02:47:18 PDT 2009,NO_QUERY,Sophs105,"hip is worse today, moving my leg causes pain...."


In [39]:
X

541200                               ahhh i hope your ok!!!
750               cool , i have no tweet apps for my razr 2
766711    i know just family drama. its lame.hey next ti...
285055    school email won't open and i have geography s...
705995                                upper airways problem
                                ...                        
159217    finallytryingto download evernote on my bberry...
298540      too late. guess i'm being stuffed in the trunk.
839945                on the way to target with mah sistaaa
732586    hip is worse today, moving my leg causes pain....
429504              1 more week? ( what to do? what to do??
Name: text, Length: 100000, dtype: object

In [40]:
y

541200    0
750       0
766711    0
285055    0
705995    0
         ..
159217    0
298540    0
839945    1
732586    0
429504    0
Name: sentiment, Length: 100000, dtype: int64

# TEXT Vectorization

In [41]:
 # Split data: 80% training, 20% testing

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=42,stratify =y) 

In [42]:
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2),
    min_df=5,
    max_df=0.95
)

X_vectorized = vectorizer.fit_transform(X_train)


# Train Model

In [43]:
classifier = LogisticRegression(
    max_iter = 1000
)


classifier.fit(X_vectorized,y_train)
########################################################


# Optimized for text classification
# from xgboost import XGBClassifier

# classifier = XGBClassifier(
#     n_estimators=200,
#     max_depth=6,
#     learning_rate=0.1,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     gamma=1,
#     reg_alpha=0.1,
#     reg_lambda=1,
#     random_state=42,
#     objective='binary:logistic',  # For 3-class classification
#     # num_class=3,                 # Number of classes
#     eval_metric='logloss',
#     n_jobs=-1
# )

# classifier.fit(X_vectorized, y_train)
########################################################

########################################################


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


# Test Predictions

In [44]:
test_vectorized = vectorizer.transform(X_test)
y_pred = classifier.predict(test_vectorized)
# y_pred = y_pred_proba.argmax(axis=1)
y_pred


# print(f"Predictions shape: {y_pred.shape}")
# print(f"Unique predictions: {np.unique(y_pred)}")
# print(f"Sample predictions: {y_pred[:10]}")

array([0, 0, 0, ..., 1, 1, 0])

In [45]:
from sklearn.metrics import classification_report, accuracy_score


print("Accuracy Score: {:.2f}".format(accuracy_score(y_pred,y_test) * 100 ) )
print(classification_report(y_test, y_pred,
                        target_names=['Negative (0)','Positive (1)']))


Accuracy Score: 78.67
              precision    recall  f1-score   support

Negative (0)       0.79      0.78      0.78      9989
Positive (1)       0.78      0.80      0.79     10011

    accuracy                           0.79     20000
   macro avg       0.79      0.79      0.79     20000
weighted avg       0.79      0.79      0.79     20000



In [46]:
joblib.dump( classifier ,
                'sentimental_model.pkl' )

joblib.dump( vectorizer,
                'vectorizer.pkl' )                

['vectorizer.pkl']

In [None]:
classifier.predict(
    vectorizer.transform(['I hate twitter' ])
) 

array([0])