# Sentiment Analysis - Production Ready

## Install Required Packages (uncomment if needed):
```bash
# For real datasets:
# pip install datasets
# pip install kaggle  # For Kaggle datasets

# For better models (optional):
# pip install transformers torch
# pip install nltk
```

In [143]:
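# Install required packages (uncomment the line below to run it).
# The code in this notebook imports: pandas, scikit-learn, joblib, datasets.
# !pip install transformers torch scikit-learn accelerate -q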

In [144]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import joblib
from datasets import load_dataset
# Identifier handed to `load_dataset`; kept as a named constant so it is easy to spot and swap.
DATASET_ID = "Sp1786/multiclass-sentiment-analysis-dataset"
dataset = load_dataset(DATASET_ID)



In [145]:
# Convert the 'train' split to a pandas DataFrame and display it.
train_split = dataset['train']
df = train_split.to_pandas()
df

Unnamed: 0,id,text,label,sentiment
0,9536,"Cooking microwave pizzas, yummy",2,positive
1,6135,Any plans of allowing sub tasks to show up in ...,1,neutral
2,17697,"I love the humor, I just reworded it. Like sa...",2,positive
3,14182,naw idk what ur talkin about,1,neutral
4,17840,That sucks to hear. I hate days like that,0,negative
...,...,...,...,...
31227,6265,Grrrr....I got the wrong size coat for the sheep,0,negative
31228,11284,4 cases of swine flu!,1,neutral
31229,6436,excellent,2,positive
31230,860,is sitting thru the boring bits in Titanic wai...,1,neutral


In [146]:
# Count missing values per column (`isna` is the canonical spelling of `isnull`).
df.isna().sum()

id           0
text         0
label        0
sentiment    0
dtype: int64

In [147]:
# Keep only what the model needs: the tweet text and its integer class label.
# The integer `label` column is renamed to `sentiments`; the string `sentiment`
# column is dropped by the column selection.
# `.copy()` makes the selection an independent DataFrame, so assigning to
# df['text'] in the cleaning cell does not raise SettingWithCopyWarning.
df = df.rename(columns={'label': 'sentiments'})
df = df[['text', 'sentiments']].copy()
df

Unnamed: 0,text,sentiments
0,"Cooking microwave pizzas, yummy",2
1,Any plans of allowing sub tasks to show up in ...,1
2,"I love the humor, I just reworded it. Like sa...",2
3,naw idk what ur talkin about,1
4,That sucks to hear. I hate days like that,0
...,...,...
31227,Grrrr....I got the wrong size coat for the sheep,0
31228,4 cases of swine flu!,1
31229,excellent,2
31230,is sitting thru the boring bits in Titanic wai...,1


In [148]:
import re

def clean_tweet(text: str) -> str:
    """Normalise a raw tweet for vectorisation.

    Steps, applied in order:
      1. Remove URLs (anything starting with ``http`` or ``www.``).
      2. Remove ``@mentions``.
      3. Strip the ``#`` from hashtags but keep the tag word.
      4. Lowercase the text.
      5. Collapse runs of whitespace to single spaces and trim the ends.

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned, lowercased, single-spaced text (may be empty).
    """
    # `\bwww\.` (word boundary + literal dot) only matches real "www." hosts.
    # An unanchored `www\S+` also matches inside ordinary words, e.g. it turns
    # "awwww" into "a", which damages exactly the emotive tokens a sentiment
    # model relies on.
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)                # Remove @mentions
    text = re.sub(r'#(\w+)', r'\1', text)           # Remove # but keep word
    text = text.lower()
    # split() with no argument splits on any whitespace run and drops empties,
    # so the join also trims leading/trailing whitespace.
    text = ' '.join(text.split())
    return text

# Run every tweet through clean_tweet and store the result back in the 'text' column.
cleaned_text = df['text'].map(clean_tweet)
df['text'] = cleaned_text
print("Text cleaned!")


Text cleaned!


In [149]:
# Feature/target split: X is the cleaned tweet text, y is the integer sentiment label.
X = df['text']
y = df['sentiments']

In [150]:
# Class-balance check: number of tweets per sentiment label.
label_counts = df['sentiments'].value_counts()
label_counts

sentiments
1    11649
2    10478
0     9105
Name: count, dtype: int64

In [151]:
# Preview the cleaned text features (pandas abbreviates long Series when displaying).
X

0                          cooking microwave pizzas, yummy
1        any plans of allowing sub tasks to show up in ...
2        i love the humor, i just reworded it. like say...
3                             naw idk what ur talkin about
4                that sucks to hear. i hate days like that
                               ...                        
31227     grrrr....i got the wrong size coat for the sheep
31228                                4 cases of swine flu!
31229                                            excellent
31230    is sitting thru the boring bits in titanic wai...
31231                                      missed the play
Name: text, Length: 31232, dtype: object

In [152]:
# Preview the integer sentiment labels that serve as the training target.
y

0        2
1        1
2        2
3        1
4        0
        ..
31227    0
31228    1
31229    2
31230    1
31231    0
Name: sentiments, Length: 31232, dtype: int64

# Train/Test Split and Text Vectorization

In [153]:
 # Split data: 80% training, 20% testing

# Stratify on y so each split keeps the same class proportions;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [154]:
# TF-IDF settings: unigrams + bigrams, capped at 5000 features; terms must occur
# in at least 5 documents and in no more than 95% of them.
tfidf_params = dict(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.95,
)
vectorizer = TfidfVectorizer(**tfidf_params)

# Fit on the training split only, so nothing from the test split leaks into the vocabulary.
X_vectorized = vectorizer.fit_transform(X_train)


# vectorizer = TfidfVectorizer(
#       max_features=10000,        # Increased from 5000
#       ngram_range=(1, 3),        # Include trigrams
#       min_df=2,                  # Reduced from 5 (keep more features)
#       max_df=0.9,                # Reduced from 0.95 (filter common words)
#       sublinear_tf=True,         # Use log scaling
#       strip_accents='unicode',   # Handle accents
#       token_pattern=r'\b\w+\b',  # Better tokenization
#       use_idf=True,              # Enable IDF
#       smooth_idf=True            # Smooth IDF weights
#   )

# X_vectorized = vectorizer.fit_transform(X_train)
# print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

# Train Model

In [155]:
# Logistic regression over the TF-IDF features.
# `multi_class` is deliberately not passed: the parameter is deprecated since
# scikit-learn 1.5 (it emits a FutureWarning and is scheduled for removal).
# With solver='lbfgs' and a target with more than two classes, the default
# already fits the multinomial (softmax) formulation, so the model is unchanged.
classifier = LogisticRegression(
    max_iter=1000,   # raised above the library default to give lbfgs room to converge
    solver='lbfgs',
)

# Kept as the last expression so the notebook displays the fitted estimator.
classifier.fit(X_vectorized, y_train)
########################################################



# from xgboost import XGBClassifier

# classifier = XGBClassifier(
#     n_estimators=300,           # Increased from 200
#     max_depth=7,                # Slightly deeper
#     learning_rate=0.05,         # Lower for better generalization
#     subsample=0.8,
#     colsample_bytree=0.8,
#     gamma=1,
#     reg_alpha=0.5,              # Increased L1 regularization
#     reg_lambda=2,               # Increased L2 regularization
#     min_child_weight=3,
#     random_state=42,
#     objective='multi:softmax',  # ✅ FIXED for 3-class
#     num_class=3,                # ✅ ADDED
#     eval_metric='mlogloss',     # ✅ FIXED
#     n_jobs=-1,
#     tree_method='hist'          # Faster training
# )

# classifier.fit(X_vectorized, y_train)
# print("XGBoost training complete!")
########################################################

########################################################




0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


# Test Predictions

In [156]:
# Vectorise the held-out text with the vectorizer that was fitted on the training
# split (transform only, no refit), then predict one class label per tweet.
test_vectorized = vectorizer.transform(X_test)
y_pred = classifier.predict(test_vectorized)
# y_pred = y_pred_proba.argmax(axis=1)
# Display the predicted label array.
y_pred


# print(f"Predictions shape: {y_pred.shape}")
# print(f"Unique predictions: {np.unique(y_pred)}")
# print(f"Sample predictions: {y_pred[:10]}")

array([2, 1, 1, ..., 0, 2, 1])

In [157]:
from sklearn.metrics import classification_report, accuracy_score

# Integer label encoding used by the dataset: 0 = negative, 1 = neutral, 2 = positive.
# Passing `labels` explicitly pins each display name to its class, so the names
# cannot silently shift if a class is absent from y_test / y_pred.
CLASS_LABELS = [0, 1, 2]
CLASS_NAMES = ['Negative (0)', 'Neutral (1)', 'Positive (2)']

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to
# be symmetric, but the conventional order avoids mistakes with other metrics.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score: {:.2f}".format(test_accuracy * 100))
print(classification_report(y_test, y_pred,
                            labels=CLASS_LABELS,
                            target_names=CLASS_NAMES))


Accuracy Score: 66.75
              precision    recall  f1-score   support

Negative (0)       0.68      0.61      0.64      1821
 Neutral (1)       0.59      0.66      0.63      2330
Positive (2)       0.76      0.72      0.74      2096

    accuracy                           0.67      6247
   macro avg       0.68      0.67      0.67      6247
weighted avg       0.67      0.67      0.67      6247



In [158]:
# Persist both fitted artefacts: the classifier is only usable together with
# the vectorizer that produced its input features.
MODEL_PATH = 'sentimental_model.pkl'
VECTORIZER_PATH = 'vectorizer.pkl'

joblib.dump(classifier, MODEL_PATH)
joblib.dump(vectorizer, VECTORIZER_PATH)

['vectorizer.pkl']

In [159]:
# Smoke-test the trained model on a raw string. The input goes through
# clean_tweet first so it gets the same normalisation (URL/mention/hashtag
# handling) as the text the model was trained on.
sample_texts = ['twitter is awesome']
classifier.predict(
    vectorizer.transform([clean_tweet(raw) for raw in sample_texts])
)

array([2])