In [12]:
import nltk
import os
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib
import re
from sklearn.model_selection import train_test_split

# Download NLTK resources if not already downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load the dataset
df = pd.read_csv('train.csv', delimiter=',', encoding='ISO-8859-1')
df


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kivsithvothy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kivsithvothy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kivsithvothy/nltk_data...


Unnamed: 0,polarity of tweet,id of the tweet,date of the tweet,query,user,text of the tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
1048567,4,1960186342,Fri May 29 07:33:44 PDT 2009,NO_QUERY,Madelinedugganx,My GrandMa is making Dinenr with my Mum
1048568,4,1960186409,Fri May 29 07:33:43 PDT 2009,NO_QUERY,OffRoad_Dude,Mid-morning snack time... A bowl of cheese noo...
1048569,4,1960186429,Fri May 29 07:33:44 PDT 2009,NO_QUERY,Falchion,@ShaDeLa same here say it like from the Termi...
1048570,4,1960186445,Fri May 29 07:33:44 PDT 2009,NO_QUERY,jonasobsessedx,@DestinyHope92 im great thaanks wbuu?


In [13]:
df.columns = ['Sentiment','id','date','query','user','text']
df = df[['Sentiment','text']]

In [14]:
df

Unnamed: 0,Sentiment,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
1048567,4,My GrandMa is making Dinenr with my Mum
1048568,4,Mid-morning snack time... A bowl of cheese noo...
1048569,4,@ShaDeLa same here say it like from the Termi...
1048570,4,@DestinyHope92 im great thaanks wbuu?


In [15]:
# Replace value 4 with 1 for positive sentiment
df['Sentiment'] = df['Sentiment'].replace({4:1})

# Downsample the majority class to balance the dataset
df_majority = df[df['Sentiment']==0]
df_minority = df[df['Sentiment']==1]
df_majority_downsampled = df_majority.sample(n=len(df_minority), random_state=42)
df_balanced = pd.concat([df_majority_downsampled, df_minority])

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment'] = df['Sentiment'].replace({4:1})


Unnamed: 0,Sentiment,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
1048567,1,My GrandMa is making Dinenr with my Mum
1048568,1,Mid-morning snack time... A bowl of cheese noo...
1048569,1,@ShaDeLa same here say it like from the Termi...
1048570,1,@DestinyHope92 im great thaanks wbuu?


In [16]:
# Preprocess the text data
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    # Original: running, Lemmatized: running
    # Original: ate, Lemmatized: ate
    # Original: dogs, Lemmatized: dog
    # Original: better, Lemmatized: better
    # Original: rocks, Lemmatized: rock
    text = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]  # Lemmatization
    # For stop_words
    # Original text: This is an example sentence to demonstrate removing stopwords.
    # Filtered text: example sentence demonstrate removing stopwords.
    return ' '.join(text)

df_balanced['text_cleaned'] = df_balanced['text'].apply(preprocess_text)

In [18]:
df_balanced['text_cleaned']

212188     amruth92 early bird didnt reply facebook wall ...
674330     sooo hot tonight wish pool id outside skinny d...
752234                            darkandrez washing clothes
415739                                        asexiness suck
138859     havent watch yet finale ai ive waiting replay ...
                                 ...                        
1048567                            grandma making dinenr mum
1048568         midmorning snack time bowl cheese noodle yum
1048569    shadela say like terminiator movie come like 3...
1048570                  destinyhope92 im great thaanks wbuu
1048571                           cant wait til date weekend
Name: text_cleaned, Length: 497152, dtype: object

In [17]:
df

Unnamed: 0,Sentiment,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
1048567,1,My GrandMa is making Dinenr with my Mum
1048568,1,Mid-morning snack time... A bowl of cheese noo...
1048569,1,@ShaDeLa same here say it like from the Termi...
1048570,1,@DestinyHope92 im great thaanks wbuu?


In [19]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_balanced['text_cleaned'], df_balanced['Sentiment'], test_size=0.2, random_state=42)
X_train

889926            finally uploading crappy britney spear pic
420864             flying back dubai 6pm philippine time sad
86095      faeorie nah dunno viv green think neighbourhoo...
926275     mcds kid mom nap think im getting better end m...
1017241    fruitty pebble organic milk really cheer espec...
                                 ...                        
810598                       another amusing tweeter tinafey
917258                         michaelmcrowley saw wolverine
274021                                          losing voice
60643      settled watch football realised dont get setan...
395703         lookin french article oral exam hope graduate
Name: text_cleaned, Length: 397721, dtype: object

In [20]:
# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [21]:
# Train a logistic regression model
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [22]:
# Evaluate the model
train_accuracy = lr_model.score(X_train_tfidf, y_train)
test_accuracy = lr_model.score(X_test_tfidf, y_test)
print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")

Training Accuracy: 0.78
Testing Accuracy: 0.77


### The classification report of a logistic regression classifier using the classification_report()

In [33]:
from sklearn.metrics import classification_report
# Predict the target values
y_pred = lr_model.predict(X_test_tfidf)

# Display the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.76      0.77     49666
           1       0.77      0.79      0.78     49765

    accuracy                           0.77     99431
   macro avg       0.77      0.77      0.77     99431
weighted avg       0.77      0.77      0.77     99431



### Accuracy for training and testing ratio 

In [23]:
# Create a dataframe with the accuracy scores
accuracy_scores = pd.DataFrame({
    "Accuracy": [train_accuracy, test_accuracy],
    "Dataset": ["Training set", "Test set"]
})

# Print the dataframe
print(accuracy_scores)

   Accuracy       Dataset
0  0.781125  Training set
1  0.773803      Test set


### Display the split percentage of training and test set as a table

In [26]:
# Create a dataframe with the split percentages
split_percentages = pd.DataFrame({
    "Dataset": ["Training set", "Test set"],
    "Percentage": [100 - 20, 20]
})

# Display the dataframe
print(split_percentages)

        Dataset  Percentage
0  Training set          80
1      Test set          20


In [31]:
data = pd.read_csv('train.csv', delimiter=',', encoding='ISO-8859-1')
print(data.corr())


                    polarity of tweet   id of the tweet
polarity of tweet             1.000000        -0.571528
id of the tweet              -0.571528         1.000000


In [10]:
# Save the model and TF-IDF vectorizer to disk
joblib.dump(lr_model, 'sentiment_analysis_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']