<a href="https://colab.research.google.com/github/ParthKhiriya/Sentiment_Analysis_PRML_Project/blob/3-model-branch/Sentiment_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import re
import string
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Installing the kaggle library
! pip install kaggle



In [4]:
# configuring the path of kaggle.json file
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
# Fetching the API to download the dataset
! kaggle datasets download -d abhi8923shriv/sentiment-analysis-dataset

Dataset URL: https://www.kaggle.com/datasets/abhi8923shriv/sentiment-analysis-dataset
License(s): CC0-1.0


In [6]:
from zipfile import ZipFile
dataset = '/content/sentiment-analysis-dataset.zip'

with ZipFile(dataset,'r') as zip:
  zip.extractall()
  print('The dataset is extracted')

The dataset is extracted


In [7]:
# Loading the main dataset and also the train and test set
df_train = pd.read_csv('train.csv', encoding='latin-1', header= None)
df_train = df_train.drop(labels=0)
df_train.columns = ["textID","text","selected_text","sentiment","Time of Tweet","Age of User","Country","Population -2020","Land Area (Km²)","Density (P/Km²)"]

# This gives the information about our data,  like what are the datatypes of content present in all the columns and also the null count
df_train = df_train.drop(["textID","selected_text","Time of Tweet","Age of User","Country","Population -2020","Land Area (Km²)","Density (P/Km²)"], axis=1)
df_train = df_train.dropna()
print(df_train)

                                                    text sentiment
1                    I`d have responded, if I were going   neutral
2          Sooo SAD I will miss you here in San Diego!!!  negative
3                              my boss is bullying me...  negative
4                         what interview! leave me alone  negative
5       Sons of ****, why couldn`t they put them on t...  negative
...                                                  ...       ...
27477   wish we could come see u on Denver  husband l...  negative
27478   I`ve wondered about rake to.  The client has ...  negative
27479   Yay good for both of you. Enjoy the break - y...  positive
27480                         But it was worth it  ****.  positive
27481     All this flirting going on - The ATG smiles...   neutral

[27480 rows x 2 columns]


In [8]:
# Loading the main dataset and also the train and test set
df_test = pd.read_csv('test.csv', encoding='latin-1', header= None)
df_test = df_test.drop(labels=0)
df_test.columns = ["textID","text","sentiment","Time of Tweet","Age of User","Country","Population -2020","Land Area (Km²)","Density (P/Km²)"]

# This gives the information about our data,  like what are the datatypes of content present in all the columns and also the null count
df_test = df_test.drop(["textID","Time of Tweet","Age of User","Country","Population -2020","Land Area (Km²)","Density (P/Km²)"], axis=1)
df_test = df_test.dropna()
print(df_test)

                                                   text sentiment
1     Last session of the day  http://twitpic.com/67ezh   neutral
2      Shanghai is also really exciting (precisely -...  positive
3     Recession hit Veronique Branquinho, she has to...  negative
4                                           happy bday!  positive
5                http://twitpic.com/4w75p - I like it!!  positive
...                                                 ...       ...
3530  its at 3 am, im very tired but i can`t sleep  ...  negative
3531  All alone in this old house again.  Thanks for...  positive
3532   I know what you mean. My little dog is sinkin...  negative
3533  _sutra what is your next youtube video gonna b...  positive
3534   http://twitpic.com/4woj2 - omgssh  ang cute n...  positive

[3534 rows x 2 columns]


In [9]:
df = pd.concat([df_train, df_test], ignore_index=True)

# Shuffle the dataset (good practice before splitting again)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Showing basic info
print(df.shape)
print(df.head())

(31014, 2)
                                                text sentiment
0           I was afraid you were going to say that.  negative
1   part 2: social networking??.. there is even r...   neutral
2  i miss the one who would do anything to spend ...  negative
3    tee we beefin....what was u supposed to do b...   neutral
4              Happy Hug Your Mom Day!! love you mom  positive


In [10]:
sentiment_ordering = ['negative', 'neutral', 'positive']

df["sentiment"] = df["sentiment"].apply(lambda x: sentiment_ordering.index(x))
print(df)

                                                    text  sentiment
0               I was afraid you were going to say that.          0
1       part 2: social networking??.. there is even r...          1
2      i miss the one who would do anything to spend ...          0
3        tee we beefin....what was u supposed to do b...          1
4                  Happy Hug Your Mom Day!! love you mom          2
...                                                  ...        ...
31009                 thanks to  and i`m now on twitter!          2
31010              My best friend is in vegas without me          1
31011  http://twitpic.com/4jken - fire and urban at r...          1
31012          A+ for effort though  http://bit.ly/Mco5v          2
31013  claire  love the show, got into the office @ 5...          2

[31014 rows x 2 columns]


In [11]:
df['sentiment'].value_counts()

# Since value_counts() is showing two different types of 0s so we have to check what is the problem
print(df['sentiment'].unique())
print(df['sentiment'].value_counts())

[0 1 2]
sentiment
1    12547
2     9685
0     8782
Name: count, dtype: int64


In [12]:
# This function performs basic tasks for cleaning the data such as lowercasing, URL handling etc.
def clean_text(text):

    text = text.lower()  # Lowercasing
    text = re.sub(r'http\S+|www.\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+|#\w+', '', text)  # Remove mentions and hashtags
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return text.strip()

# Save the cleaned text in our dataframe
df['cleaned_text'] = df['text'].apply(clean_text)

# Tokenisation - splits the dataset into an array of words(nltk generally performs word tokenisation i.e. seperating words) for better handling of punctuations
df['tokens'] = df['cleaned_text'].apply(lambda x: x.split())

# Removing all the stopwords because it will not impact the sentiment of the tweet
stop_words = set(stopwords.words('english'))
df['tokens'] = df['tokens'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

# Then stemming(converting a word into its root word e.g. acting, actor to act)
stemmer = PorterStemmer()
df['stemmed_text'] = df['tokens'].apply(lambda tokens: ' '.join([stemmer.stem(word) for word in tokens]))

print(df)

                                                    text  sentiment  \
0               I was afraid you were going to say that.          0   
1       part 2: social networking??.. there is even r...          1   
2      i miss the one who would do anything to spend ...          0   
3        tee we beefin....what was u supposed to do b...          1   
4                  Happy Hug Your Mom Day!! love you mom          2   
...                                                  ...        ...   
31009                 thanks to  and i`m now on twitter!          2   
31010              My best friend is in vegas without me          1   
31011  http://twitpic.com/4jken - fire and urban at r...          1   
31012          A+ for effort though  http://bit.ly/Mco5v          2   
31013  claire  love the show, got into the office @ 5...          2   

                                            cleaned_text  \
0                i was afraid you were going to say that   
1      part  social network

In [13]:
from sklearn.model_selection import train_test_split

X = df['stemmed_text']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify= y, random_state=42)

print(X.shape, X_train.shape, X_test.shape)

(31014,) (24811,) (6203,)


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

print(f"Training Data:\n {X_train}")
print(f"Testing Data:\n {X_test}")

Training Data:
 <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 170788 stored elements and shape (24811, 19951)>
  Coords	Values
  (0, 6746)	0.1576184745171829
  (0, 3788)	0.30025083424054577
  (0, 16495)	0.2986435589723957
  (0, 7453)	0.2531550567464741
  (0, 121)	0.3533842769946217
  (0, 7564)	0.2922776098063534
  (0, 9555)	0.36336571468762907
  (0, 1611)	0.3680785975657046
  (0, 15360)	0.2720598561485078
  (0, 11114)	0.21876023862030103
  (0, 335)	0.36336571468762907
  (1, 6746)	0.17737105739448014
  (1, 7677)	0.4899160560389565
  (1, 16988)	0.5248067699449726
  (1, 19925)	0.5248067699449726
  (1, 19252)	0.3226354636996705
  (1, 14776)	0.27126339973749825
  (2, 16960)	0.1360862675700317
  (2, 8547)	0.324737343480285
  (2, 9034)	0.324737343480285
  (2, 655)	0.5328723799301739
  (2, 2389)	0.324737343480285
  (2, 7721)	0.324737343480285
  (2, 6643)	0.324737343480285
  (2, 6640)	0.296197571872595
  :	:
  (24807, 4476)	0.5047932663896494
  (24807, 5426)	0.5047932663896494
 

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

y_pred_train_logistic = log_reg.predict(X_train)
accuracy_train_logistic = accuracy_score(y_train, y_pred_train_logistic)
print(f"Training accuracy : {accuracy_train_logistic}")

y_pred_test_logistic = log_reg.predict(X_test)
accuracy_test_logistic = accuracy_score(y_test, y_pred_test_logistic)
print(f"Testing accuracy : {accuracy_test_logistic}")

Training accuracy : 0.8062955946959011
Testing accuracy : 0.6948250846364662


In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

y_pred_train_linear = lin_reg.predict(X_train)
r2_train_linear = accuracy_score(y_train, y_pred_train_linear)
print(f"Training accuracy : {r2_train_linear}")

y_pred_test_linear = lin_reg.predict(X_test)
r2_test_linear = accuracy_score(y_test, y_pred_test_linear)
print(f"Testing accuracy : {r2_test_linear}")

Training accuracy : 0.8414899403815954
Testing accuracy : -0.03909002572318632


In [17]:
from sklearn.svm import SVC

svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))


SVM Accuracy: 0.7114299532484282


In [18]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

y_pred_nb = nb_model.predict(X_test)
print("Naïve Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))


Naïve Bayes Accuracy: 0.6167983233919071


In [19]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.7061099467999356


In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Initialize the Decision Tree Classifier
dt_clf = DecisionTreeClassifier(random_state=42)

# Train the model
dt_clf.fit(X_train, y_train)

# Predictions
y_pred_train_dt = dt_clf.predict(X_train)
y_pred_test_dt = dt_clf.predict(X_test)

# Evaluate the model
accuracy_train_dt = accuracy_score(y_train, y_pred_train_dt)
accuracy_test_dt = accuracy_score(y_test, y_pred_test_dt)

print(f"Decision Tree Training Accuracy: {accuracy_train_dt}")
print(f"Decision Tree Testing Accuracy: {accuracy_test_dt}")

# Classification report
print("Classification Report for Decision Tree on Test Set:")
print(classification_report(y_test, y_pred_test_dt))

Decision Tree Training Accuracy: 0.9971383660473178
Decision Tree Testing Accuracy: 0.6521038207319039
Classification Report for Decision Tree on Test Set:
              precision    recall  f1-score   support

           0       0.63      0.58      0.60      1756
           1       0.63      0.65      0.64      2510
           2       0.70      0.72      0.71      1937

    accuracy                           0.65      6203
   macro avg       0.65      0.65      0.65      6203
weighted avg       0.65      0.65      0.65      6203



In [21]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))


Random Forest Accuracy: 0.7091729808157343


In [30]:
import joblib

# Save existing models
joblib.dump(log_reg, 'log_reg.pkl')
joblib.dump(lin_reg, 'lin_reg.pkl')
joblib.dump(nb_model, 'nb_model.pkl')
joblib.dump(xgb_model, 'xgb_model.pkl')
joblib.dump(rf_model, 'rf_model.pkl')
joblib.dump(svm_model, 'svm_model.pkl')
joblib.dump(dt_clf, 'dt_model.pkl')



# Save the shared vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [31]:
from google.colab import files

# Existing models
files.download('log_reg.pkl')
files.download('lin_reg.pkl')
files.download('nb_model.pkl')
files.download('xgb_model.pkl')
files.download('rf_model.pkl')
files.download('svm_model.pkl')
files.download('dt_model.pkl')

# Shared vectorizer
files.download('vectorizer.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [34]:
from sklearn.metrics import accuracy_score, r2_score

# Existing models
y_pred_log = log_reg.predict(X_test)
acc_log = accuracy_score(y_test, y_pred_log)

y_pred_lin = lin_reg.predict(X_test)
acc_lin = r2_score(y_test, y_pred_lin)  # Regression model

y_pred_nb = nb_model.predict(X_test)
acc_nb = accuracy_score(y_test, y_pred_nb)

y_pred_xgb = xgb_model.predict(X_test)
acc_xgb = accuracy_score(y_test, y_pred_xgb)

y_pred_rf = rf_model.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)

y_pred_svm = svm_model.predict(X_test)
acc_svm = accuracy_score(y_test, y_pred_svm)

y_pred_dt = dt_clf.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)


In [35]:
accuracies = {
    'Logistic Regression': acc_log,
    'Linear Regression (R²)': acc_lin,
    'Naive Bayes': acc_nb,
    'XGBoost': acc_xgb,
    'Random Forest': acc_rf,
    'SVM': acc_svm,
    'Decision Tree': acc_dt
}

# Save as pkl
joblib.dump(accuracies, 'accuracies.pkl')


# For downloading in Jupyter Notebook
from google.colab import files
files.download('accuracies.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>