<a href="https://colab.research.google.com/github/Nwanekwup/Sentiment_Analysis_Project/blob/main/Sentiment_Analysis_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab runtime; the dataset path used later lives under /content/drive
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd
# Widen pandas' column display to 200 characters so long tweets are not cut short in previews
pd.set_option("display.max_colwidth", 200)
import matplotlib.pyplot as plt
import re

In [3]:
# Location of the tweets CSV on the mounted Google Drive
file_path = "/content/drive/My Drive/Sentiment Analysis Project/tweets.csv"
# Read the CSV, taking the column names from its first row (header=0)
data = pd.read_csv(file_path, header=0)

# Preview the first rows; as the cell's last expression this renders as a table
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [4]:
import nltk
# Fetch the NLTK resources used by the preprocessing step: the punkt tokenizer
# data ('punkt', 'punkt_tab'), WordNet, and the stop-word corpus.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression


In [6]:
# Data preprocessing phase
# Objects shared by preprocess_text: a single lemmatizer instance, and the English
# stop-word list held as a set because it is probed with `in` for every token.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [7]:

def preprocess_text(text):
  """Turn a raw tweet into a space-separated string of cleaned, lemmatized tokens.

  The text is stripped of URLs, @mentions, '#' symbols and remaining
  punctuation, lowercased, tokenized with ``word_tokenize``, filtered against
  ``stop_words`` and finally lemmatized with ``lemmatizer``.

  Args:
    text: The raw tweet. Non-string values are converted with ``str``;
      ``None`` and float NaN (pandas' missing-value marker) count as an
      empty tweet.

  Returns:
    str: The surviving tokens joined by single spaces; '' for missing input.
  """
  # A missing tweet would otherwise be stringified by str() and treated as a
  # literal word ("None" / "nan"). NaN is the only float not equal to itself.
  if text is None or (isinstance(text, float) and text != text):
    return ''

  # ensure text is a string
  text = str(text)

  # Remove URLs. The previous pattern, 'www.\.\S+', demanded an extra character
  # and a second dot after "www", so ordinary "www.example.com" links were
  # kept. '\b' stops the pattern from firing inside words such as "awww...".
  text = re.sub(r'https?://\S+|\bwww\.\S+', '', text)
  text = re.sub(r'@\w+', '', text)  # Remove user mentions
  text = re.sub(r'#', '', text)  # Remove hashtag symbol, keep the tag word
  text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
  text = text.lower()  # Convert to lowercase

  # Tokenize
  tokens = word_tokenize(text)

  # Drop stop words first, then lemmatize what remains
  processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
  return ' '.join(processed_tokens)



In [8]:
# Run the cleaner over every tweet and store the result next to the raw text
data['processed_tweet'] = data['tweet'].apply(preprocess_text)

# Print the first rows of both columns side by side to eyeball the effect of cleaning
print(
    "\nPreprocessing complete. Here's a sample of the original vs. processet tweets:",
    data[['tweet', 'processed_tweet']].head(),
    sep="\n",
)


Preprocessing complete. Here's a sample of the original vs. processet tweets:
                                                                                                                                 tweet  \
0     #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone   
1  Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/   
2          We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu   
3                     I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/   
4         What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!   

                                                                                   processed

Feature engineering: Split the preprocessed data into training and testing sets, then convert the text into numerical features using a Bag-of-Words model

In [13]:
# Model input is the cleaned tweet text; the target is the `label` column
X, y = data['processed_tweet'], data['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Bag-of-Words: build the vocabulary from the training tweets only, then encode
# the held-out tweets against that same vocabulary (document-term matrices)
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print(f"Shape of training data features: {X_train_vec.shape}")


Shape of training data features: (6336, 14401)


Train Baseline Model: train a baseline Logistic Regression classifier on the numerical features created in the previous step

In [10]:
# Scoring helpers for the baseline classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Build the logistic-regression baseline and fit it on the training features
# (fit returns the estimator itself, so construction and training chain)
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Predict a label for every held-out tweet
y_pred = model.predict(X_test_vec)

# Raw look at the first 20 predicted labels
print("First 20 predictions from the model:", y_pred[:20])

# Headline metrics comparing predictions with the true test labels
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    sep="\n",
)

# Per-class breakdown of precision, recall and F1
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))




First 20 predictions from the model: [0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0]
Accuracy: 0.8908
Precision: 0.8346
Recall: 0.7477

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.91      0.94      0.93      1152
           1       0.83      0.75      0.79       432

    accuracy                           0.89      1584
   macro avg       0.87      0.85      0.86      1584
weighted avg       0.89      0.89      0.89      1584



Refining strategy with TF-IDF

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Second feature set: the same train/test tweets, encoded with TF-IDF
tfidf_vectorizer = TfidfVectorizer()

# The vocabulary is learned from the training tweets only
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# The held-out tweets are encoded with the already-fitted vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"Shape of training data features: {X_train_tfidf.shape}")


Shape of training data features: (6336, 14401)


In [14]:
# Fresh logistic-regression model, fitted on the TF-IDF training features
# (fit returns the estimator itself, so construction and training chain)
tfidf_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict labels for the TF-IDF encoded test tweets
y_pred_tfidf = tfidf_model.predict(X_test_tfidf)

# Report the refined model's per-class performance on the test set
print(
    "--- Classification Report (TF-IDF) ---",
    classification_report(y_test, y_pred_tfidf),
    sep="\n",
)


--- Classification Report (TF-IDF) ---
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1152
           1       0.86      0.64      0.73       432

    accuracy                           0.87      1584
   macro avg       0.87      0.80      0.83      1584
weighted avg       0.87      0.87      0.87      1584

