# **Mount the Drive**

In [None]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem at a named mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Import the libraries**

In [None]:
#import the libraries
import pandas as pd
import numpy as np

In [None]:
# Read the tweet CSV from the mounted Drive into a DataFrame.
tweet_csv_path = "/content/drive/MyDrive/DSL Project/Datasets/twitter_training_10000_new.csv"
tweetdata = pd.read_csv(tweet_csv_path)

# Preview the first ten rows; a single print with a newline separator emits
# the same text as printing the banner and the frame separately.
print("---------------------Tweet data---------------------\n", tweetdata.head(10), sep="\n")

---------------------Tweet data---------------------

  sentiment                                               text
0  Positive                    I am so happy to see you again 
1  Positive                               what a wonderful day
2  Positive  im getting on borderlands and i will murder yo...
3  Positive  I am coming to the borders and I will kill you...
4  Positive  im getting on borderlands and i will kill you ...
5  Positive  im coming on borderlands and i will murder you...
6  Positive  im getting on borderlands 2 and i will murder ...
7  Positive  im getting into borderlands and i can murder y...
8  Positive  So I spent a few hours making something for fu...
9  Positive  So I spent a couple of hours doing something f...


# **Know the datasets**

In [None]:
# Show the (rows, columns) shape of the raw dataset
raw_shape = tweetdata.shape
print("Tweet datset : ", raw_shape)

Tweet datset :  (9999, 2)


# **Cleaning the datasets (Base Level Preprocessing)**

In [None]:
# Count the missing values in each column of the raw dataset
missing_per_column = tweetdata.isna().sum()
print("Tweet datset : ", missing_per_column)

Tweet datset :  sentiment      0
text         112
dtype: int64


In [None]:
# Select every row that has at least one missing value and display it
has_missing = tweetdata.isna().any(axis="columns")
tweetdata_row_with_null = tweetdata.loc[has_missing]
print("--------------------Tweet dataset-------------------------\n", tweetdata_row_with_null, sep="\n")

--------------------Tweet dataset-------------------------

       sentiment text
63       Neutral  NaN
555      Neutral  NaN
591      Neutral  NaN
747     Positive  NaN
1107    Positive  NaN
...          ...  ...
9933    Negative  NaN
9934    Negative  NaN
9989  Irrelevant  NaN
9990  Irrelevant  NaN
9991  Irrelevant  NaN

[112 rows x 2 columns]


In [None]:
# Drop every row that contains a missing value.
# .copy() makes the result an independent DataFrame, so later column
# assignments on `cleaned_tweetdata` do not raise pandas'
# SettingWithCopyWarning ("A value is trying to be set on a copy of a slice").
cleaned_tweetdata = tweetdata.dropna().copy()

# Confirm that no missing values remain in any column
print("cleaned_tweetdata : ",cleaned_tweetdata.isnull().sum())

cleaned_tweetdata :  sentiment    0
text         0
dtype: int64


In [None]:
# Preview the first ten rows of the frame with null rows removed
print(cleaned_tweetdata.iloc[:10])

  sentiment                                               text
0  Positive                    I am so happy to see you again 
1  Positive                               what a wonderful day
2  Positive  im getting on borderlands and i will murder yo...
3  Positive  I am coming to the borders and I will kill you...
4  Positive  im getting on borderlands and i will kill you ...
5  Positive  im coming on borderlands and i will murder you...
6  Positive  im getting on borderlands 2 and i will murder ...
7  Positive  im getting into borderlands and i can murder y...
8  Positive  So I spent a few hours making something for fu...
9  Positive  So I spent a couple of hours doing something f...



In [None]:
# Shape of the dataset once the null rows are removed
cleaned_shape = cleaned_tweetdata.shape
print("Cleaned Tweet dataset : ", cleaned_shape)

Cleaned Tweet dataset :  (9887, 2)


In [None]:
# Raw vs. cleaned shape, side by side (space, tab, space between the two tuples)
print(f"{tweetdata.shape} \t {cleaned_tweetdata.shape}")

(9999, 2) 	 (9887, 2)


# **Advanced data preprocessing by text preprocessing**


1.   Removing Special Characters and Punctuation
2.   Converting Text to Lowercase
3. Removing Stop Words
4. Stemming or Lemmatization



In [None]:
#import the required libraries for text processing
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import datetime  # Import the datetime library

# Fetch the NLTK resources needed by the preprocessing step below:
# 'punkt' (tokenizer models used by word_tokenize) and 'stopwords'
# (the stop-word lists read via stopwords.words).
# The value of the last call is what the cell displays as its output.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Text-preprocessing helpers.
# None of these objects depend on the input text, so they are built once here
# rather than on every call (preprocess_text is applied to every row).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Normalise one piece of text before vectorisation.

    Steps, in order: delete ASCII punctuation characters, tokenize,
    lowercase, drop English stop words, apply Porter stemming.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (an empty string when
        no tokens remain).
    """
    # Delete every character listed in string.punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Tokenize, then lowercase so stop-word matching ignores case
    tokens = [word.lower() for word in word_tokenize(text)]

    # Drop stop words and stem what is left
    stemmed_tokens = [_STEMMER.stem(word) for word in tokens if word not in _STOP_WORDS]

    return ' '.join(stemmed_tokens)

In [None]:
# Apply the preprocessing function to the whole 'text' column and time the run.
start_time1 = datetime.datetime.now()
# .assign() returns a new DataFrame with the replaced column instead of writing
# into `cleaned_tweetdata` in place; the previous in-place `.loc` write raised
# pandas' SettingWithCopyWarning.
cleaned_tweetdata = cleaned_tweetdata.assign(text=cleaned_tweetdata['text'].apply(preprocess_text))
end_time1 = datetime.datetime.now()

# Elapsed wall-clock time of the preprocessing step (a datetime.timedelta)
execution_time1 = end_time1 - start_time1

# Alias under the name used in the report line below
exe_time_tweetdata = execution_time1

print("Execution time for the cleaned_tweetdata : ", exe_time_tweetdata)

Execution time for the cleaned_tweetdata :  0:00:07.009967


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_tweetdata.loc[:, 'text'] = cleaned_tweetdata['text'].apply(preprocess_text)


In [None]:
# Show the first ten rows of the cleaned DataFrame
print(cleaned_tweetdata.iloc[:10])

  sentiment                                               text
0  Positive                                          happi see
1  Positive                                         wonder day
2  Positive                           im get borderland murder
3  Positive                                   come border kill
4  Positive                             im get borderland kill
5  Positive                          im come borderland murder
6  Positive                         im get borderland 2 murder
7  Positive                           im get borderland murder
8  Positive  spent hour make someth fun dont know huge bord...
9  Positive  spent coupl hour someth fun dont know im huge ...


# **Split the datasets into train and test**

In [None]:
#import the libraries
import pandas as pd
from sklearn.model_selection import train_test_split

# Features are the text column, targets are the sentiment labels
X1, y1 = cleaned_tweetdata['text'], cleaned_tweetdata['sentiment']

# First hold out 15% as the test set, then split 15% of the remainder off as
# the validation set; both splits use the same size and seed.
split_options = {"test_size": 0.15, "random_state": 30}
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, **split_options)
X_train1, X_val1, y_train1, y_val1 = train_test_split(X_train1, y_train1, **split_options)

# Report how many samples ended up in each subset
print("------------------------------------Size of the datasets of the Tweet Datasets--------------------------------------\n")
for subset_name, subset in (("Training", X_train1), ("Validation", X_val1), ("Test", X_test1)):
    print(f"{subset_name} set size: {len(subset)} samples")

------------------------------------Size of the datasets of the Tweet Datasets--------------------------------------

Training set size: 7142 samples
Validation set size: 1261 samples
Test set size: 1484 samples


# **Vectorization**

In [None]:
#import the libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Vectorize the text data using TF-IDF (cleaned_tweetdata).
# max_features=1000 limits the vocabulary size; adjust as needed.
vectorizer = TfidfVectorizer(max_features=1000)

# Fit on the training split only, then reuse the fitted vectorizer to
# transform the validation and test splits.
X_train1_tfidf = vectorizer.fit_transform(X_train1)
X_val1_tfidf = vectorizer.transform(X_val1)
X_test1_tfidf = vectorizer.transform(X_test1)

# Pickle the fitted vectorizer. fit_transform() above has already fitted it on
# X_train1, so it is not fitted a second time before saving.
with open("tfidf_10000_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

# **Logistic Regression Model**

In [None]:
#import the libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit logistic regression on the TF-IDF training features (cleaned_tweetdata)
Logistic_Tweet_model = LogisticRegression(random_state=100, max_iter=200)
Logistic_Tweet_model.fit(X_train1_tfidf, y_train1)

# Predict labels for the test split
y_test1_pred = Logistic_Tweet_model.predict(X_test1_tfidf)

# Score the test-set predictions
accuracy1_logistic = accuracy_score(y_test1, y_test1_pred)
report1_logistic = classification_report(y_test1, y_test1_pred)
conf_matrix1_logistic = confusion_matrix(y_test1, y_test1_pred)

# Emit the whole report with one print; the "\n" separator yields the same
# text as printing each entry with its own print call.
print(
    "------------------------------------------Model evaluation with the cleaned_tweetdata (logistic)----------------------------------------------------\n",
    "\n-----------------------------------Accuracy---------------------------------------\n",
    f"Test Accuracy: {accuracy1_logistic}",
    "\n----------------------Classification report---------------------------------------\n",
    f"Classification Report:\n {report1_logistic}",
    "\n-------------------------Confusion Matrix---------------------------------------\n",
    f"Confusion Matrix:\n {conf_matrix1_logistic}",
    "\n-----------------------------------------Model evaluation with the cleaned_tweetdata ends (logistic)-----------------------------------------------\n",
    sep="\n",
)

------------------------------------------Model evaluation with the cleaned_tweetdata (logistic)----------------------------------------------------


-----------------------------------Accuracy---------------------------------------

Test Accuracy: 0.6913746630727763

----------------------Classification report---------------------------------------

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.69      0.53      0.60       283
    Negative       0.71      0.73      0.72       347
     Neutral       0.75      0.61      0.67       391
    Positive       0.65      0.82      0.73       463

    accuracy                           0.69      1484
   macro avg       0.70      0.68      0.68      1484
weighted avg       0.70      0.69      0.69      1484


-------------------------Confusion Matrix---------------------------------------

Confusion Matrix:
 [[150  30  31  72]
 [ 17 255  25  50]
 [ 22  44 240  85]
 [ 27  29  26 381]]

-------

# **Support Vector Machine (SVM)**

In [None]:
from sklearn.svm import SVC

# Fit a linear-kernel SVM on the TF-IDF training features (cleaned_tweetdata).
# Other kernels and C values can be tried here.
Tweet_svm_model = SVC(C=1.0, kernel='linear')
Tweet_svm_model.fit(X_train1_tfidf, y_train1)

# Predict labels for the test split
y_test1_svm_pred = Tweet_svm_model.predict(X_test1_tfidf)

# Score the test-set predictions
accuracy1_svm = accuracy_score(y_test1, y_test1_svm_pred)
report1_svm = classification_report(y_test1, y_test1_svm_pred)
conf_matrix1_svm = confusion_matrix(y_test1, y_test1_svm_pred)

# Emit the whole SVM report with one print; the "\n" separator yields the same
# text as printing each entry with its own print call.
print(
    "------------------------------------------Model evaluation with the cleaned_tweetdata (SVM)----------------------------------------------------\n",
    "\n-----------------------------------Accuracy---------------------------------------\n",
    f"Test Accuracy: {accuracy1_svm}",
    "\n----------------------Classification report---------------------------------------\n",
    f"Classification Report:\n {report1_svm}",
    "\n-------------------------Confusion Matrix---------------------------------------\n",
    f"Confusion Matrix:\n {conf_matrix1_svm}",
    "\n-----------------------------------------Model evaluation with the cleaned_tweetdata (SVM) ends-----------------------------------------------\n",
    sep="\n",
)

------------------------------------------Model evaluation with the cleaned_tweetdata (SVM)----------------------------------------------------


-----------------------------------Accuracy---------------------------------------

Test Accuracy: 0.7115902964959568

----------------------Classification report---------------------------------------

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.69      0.59      0.63       283
    Negative       0.74      0.76      0.75       347
     Neutral       0.81      0.62      0.70       391
    Positive       0.66      0.83      0.73       463

    accuracy                           0.71      1484
   macro avg       0.72      0.70      0.70      1484
weighted avg       0.72      0.71      0.71      1484


-------------------------Confusion Matrix---------------------------------------

Confusion Matrix:
 [[166  26  20  71]
 [ 19 262  19  47]
 [ 26  39 244  82]
 [ 30  29  20 384]]

------------

# **Random Forest Method**

In [None]:
#import the libraries
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree Random Forest on the TF-IDF training features (cleaned_tweetdata)
RandomForest_Tweet_model = RandomForestClassifier(random_state=30, n_estimators=100)
RandomForest_Tweet_model.fit(X_train1_tfidf, y_train1)

# Predict labels for the test split
y_test1_pred = RandomForest_Tweet_model.predict(X_test1_tfidf)

# Score the test-set predictions
accuracy1_rf = accuracy_score(y_test1, y_test1_pred)
report1_rf = classification_report(y_test1, y_test1_pred)
conf_matrix1_rf = confusion_matrix(y_test1, y_test1_pred)

# Emit the whole Random Forest report with one print; the "\n" separator
# yields the same text as printing each entry with its own print call.
print(
    "------------------------------------------Model evaluation with the cleaned_tweetdata (Random Forest)----------------------------------------------------\n",
    "\n-----------------------------------Accuracy---------------------------------------\n",
    f"Test Accuracy: {accuracy1_rf:.4f}",
    "\n----------------------Classification report---------------------------------------\n",
    f"Classification Report:\n {report1_rf}",
    "\n-------------------------Confusion Matrix---------------------------------------\n",
    f"Confusion Matrix:\n {conf_matrix1_rf}",
    "\n-----------------------------------------Model evaluation with the cleaned_tweetdata ends (Random Forest)-----------------------------------------------\n",
    sep="\n",
)

------------------------------------------Model evaluation with the cleaned_tweetdata (Random Forest)----------------------------------------------------


-----------------------------------Accuracy---------------------------------------

Test Accuracy: 0.8592

----------------------Classification report---------------------------------------

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.87      0.80      0.83       283
    Negative       0.89      0.85      0.87       347
     Neutral       0.90      0.86      0.88       391
    Positive       0.80      0.90      0.85       463

    accuracy                           0.86      1484
   macro avg       0.87      0.85      0.86      1484
weighted avg       0.86      0.86      0.86      1484


-------------------------Confusion Matrix---------------------------------------

Confusion Matrix:
 [[226  10  11  36]
 [ 13 296  10  28]
 [ 10   7 335  39]
 [ 11  18  16 418]]

--------------

# **Naive Bayes (MultinomialNB)**

In [None]:
# Import the libraries
from sklearn.naive_bayes import MultinomialNB

# Fit multinomial Naive Bayes on the TF-IDF training features (cleaned_tweetdata)
NaiveBayes_Tweet_model = MultinomialNB()
NaiveBayes_Tweet_model.fit(X_train1_tfidf, y_train1)

# Predict labels for the test split
y_test1_pred = NaiveBayes_Tweet_model.predict(X_test1_tfidf)

# Score the test-set predictions
accuracy1_nb = accuracy_score(y_test1, y_test1_pred)
report1_nb = classification_report(y_test1, y_test1_pred)
conf_matrix1_nb = confusion_matrix(y_test1, y_test1_pred)

# Emit the whole Naive Bayes report with one print; the "\n" separator yields
# the same text as printing each entry with its own print call.
print(
    "------------------------------------------Model evaluation with the cleaned_tweetdata (Naive Bayes)----------------------------------------------------\n",
    "\n-----------------------------------Accuracy---------------------------------------\n",
    f"Test Accuracy: {accuracy1_nb:.4f}",
    "\n----------------------Classification report---------------------------------------\n",
    f"Classification Report:\n {report1_nb}",
    "\n-------------------------Confusion Matrix---------------------------------------\n",
    f"Confusion Matrix:\n {conf_matrix1_nb}",
    "\n-----------------------------------------Model evaluation with the cleaned_tweetdata ends (Naive Bayes)-----------------------------------------------\n",
    sep="\n",
)

------------------------------------------Model evaluation with the cleaned_tweetdata (Naive Bayes)----------------------------------------------------


-----------------------------------Accuracy---------------------------------------

Test Accuracy: 0.6361

----------------------Classification report---------------------------------------

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.73      0.35      0.47       283
    Negative       0.69      0.69      0.69       347
     Neutral       0.78      0.51      0.61       391
    Positive       0.55      0.88      0.67       463

    accuracy                           0.64      1484
   macro avg       0.68      0.61      0.61      1484
weighted avg       0.67      0.64      0.62      1484


-------------------------Confusion Matrix---------------------------------------

Confusion Matrix:
 [[ 99  37  25 122]
 [  7 240  14  86]
 [ 20  45 198 128]
 [ 10  28  18 407]]

----------------

# **Accuracy comparison of all models**

In [None]:
# Print the test accuracy of every model trained above, one line per model
print("===============================Accuracy comparison========================================")
print("\n-----------------------------------------For Tweet data---------------------------------------------------\n")
for model_label, model_accuracy in (
    ("Logistic Regression", accuracy1_logistic),
    ("SVM", accuracy1_svm),
    ("Random Forest", accuracy1_rf),
    ("Naive Bayes", accuracy1_nb),
):
    print(f"{model_label} Accuracy (Tweet Data):", model_accuracy)


-----------------------------------------For Tweet data---------------------------------------------------

Logistic Regression Accuracy (Tweet Data): 0.6913746630727763
SVM Accuracy (Tweet Data): 0.7115902964959568
Random Forest Accuracy (Tweet Data): 0.8591644204851752
Naive Bayes Accuracy (Tweet Data): 0.6361185983827493


# **Accuracy in the % form**

In [None]:
# Accuracies measured in the evaluation cells, keyed by model name.
# They are read from the computed variables instead of being typed in by hand,
# so the percentages always match the run that produced them (the previously
# hard-coded numbers did not match the accuracies computed in this notebook).
tweet_data_accuracies = {
    "Logistic Regression": accuracy1_logistic,
    "SVM": accuracy1_svm,
    "Random Forest": accuracy1_rf,
    "Naive Bayes": accuracy1_nb,
}

# Convert the fractions to percentages (rounding to 3 decimals happens when printing)
tweet_data_accuracies_percent = {model: accuracy * 100 for model, accuracy in tweet_data_accuracies.items()}

# Display accuracies in percentage format
print("===============================Accuracy comparison========================================\n")
print("-----------------------------------------For Tweet data---------------------------------------------------\n")
for model, accuracy in tweet_data_accuracies_percent.items():
    print(f"{model} Accuracy (Tweet Data): {accuracy:.3f}%")

print("========================================================================================")



-----------------------------------------For Tweet data---------------------------------------------------

Logistic Regression Accuracy (Tweet Data): 77.254%
SVM Accuracy (Tweet Data): 80.888%
Random Forest Accuracy (Tweet Data): 89.098%
Naive Bayes Accuracy (Tweet Data): 69.852%


# **Conclusion**
Based on the accuracy comparison of the same dataset with different numbers of entries (0 to 100, 0 to 500, 0 to 1000, 0 to 5000, 0 to 10000, 0 to 50000 and 0 to 70000), **Random Forest** gives the highest accuracy for the Tweet dataset, in the range **(0.84 - 0.92) or (84 - 92 %)**. Hence the Random Forest algorithm will be used for the **Sentiment Analysis.**

# **Saving the model**

In [None]:
#import the libraries
import pickle

# Serialize the trained Random Forest classifier to a file with pickle
model_path = "RandomForest_Tweet_10000_model.model"
with open(model_path, "wb") as model_file:
    pickle.dump(RandomForest_Tweet_model, model_file)

print("Model saved")

Model saved
