In [28]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [32]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\16088\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\16088\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
column_names = ["ID", "Category", "Sentiment", "Text"]
data = pd.read_csv("twitter_training.csv",names=column_names)

In [13]:
#print("\nMissing values in the dataset:\n", data.isnull().sum())

In [12]:
# Handle missing values in 'Text' column (Option 1: Fill with empty string)
#data['Text'].fillna("", inplace=True)
# Remove rows with missing values in the 'Text' column
#data = data.dropna(subset=['Text'])

In [5]:
data.head()

Unnamed: 0,ID,Category,Sentiment,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [11]:
#print("\nMissing values in the dataset:\n", data.isnull().sum())

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         74682 non-null  int64 
 1   Category   74682 non-null  object
 2   Sentiment  74682 non-null  object
 3   Text       73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [8]:
data.dtypes

ID            int64
Category     object
Sentiment    object
Text         object
dtype: object

In [10]:
# Define a function to clean the text
def clean_text(text):
    # Ensure the input is a string
    text = str(text).lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove user mentions and hashtags
    text = re.sub(r'\@\w+|\#', '', text)
    
    # Remove non-alphanumeric characters except spaces
    text = re.sub(r'[^A-Za-z0-9 ]+', ' ', text)
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove stop words
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    
    # Join tokens back to a single string
    return ' '.join(tokens)

# Apply the cleaning function to the 'Text' column
data['Cleaned_Text'] = data['Text'].apply(clean_text)

# Display cleaned data
print(data[['Text', 'Cleaned_Text']].head())


                                                Text  \
0  im getting on borderlands and i will murder yo...   
1  I am coming to the borders and I will kill you...   
2  im getting on borderlands and i will kill you ...   
3  im coming on borderlands and i will murder you...   
4  im getting on borderlands 2 and i will murder ...   

                      Cleaned_Text  
0    im getting borderlands murder  
1              coming borders kill  
2      im getting borderlands kill  
3     im coming borderlands murder  
4  im getting borderlands 2 murder  


In [14]:
data.shape

(74682, 5)

In [15]:

# Encode target labels
le = LabelEncoder()
data['Encoded_Sentiment'] = le.fit_transform(data['Sentiment'])

# Display cleaned data
print(data[['Text', 'Cleaned_Text', 'Sentiment', 'Encoded_Sentiment']].head())

                                                Text  \
0  im getting on borderlands and i will murder yo...   
1  I am coming to the borders and I will kill you...   
2  im getting on borderlands and i will kill you ...   
3  im coming on borderlands and i will murder you...   
4  im getting on borderlands 2 and i will murder ...   

                      Cleaned_Text Sentiment  Encoded_Sentiment  
0    im getting borderlands murder  Positive                  3  
1              coming borders kill  Positive                  3  
2      im getting borderlands kill  Positive                  3  
3     im coming borderlands murder  Positive                  3  
4  im getting borderlands 2 murder  Positive                  3  


In [16]:
data.head()

Unnamed: 0,ID,Category,Sentiment,Text,Cleaned_Text,Encoded_Sentiment
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,im getting borderlands murder,3
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,coming borders kill,3
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting borderlands kill,3
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming borderlands murder,3
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting borderlands 2 murder,3


In [22]:
X = data["Cleaned_Text"]
y = data["Encoded_Sentiment"]

In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=53
)

count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train.values)
count_test = count_vectorizer.transform(X_test.values)

In [27]:
# Display shapes of train and test sets to confirm
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (50036,)
Testing set shape: (24646,)


In [40]:
alpha =  [0.1, 0.5, 1.0, 1.5, 2.0]
for i in alpha:
    
    nb_classifier = MultinomialNB(alpha=i)
    
    nb_classifier.fit(count_train, y_train)
    pred = nb_classifier.predict(count_test)
    print("Accuracy Score : ",metrics.accuracy_score(y_test, pred)*100)

Accuracy Score :  77.58256917958289
Accuracy Score :  75.22518867158972
Accuracy Score :  73.33441532094457
Accuracy Score :  71.81287024263571
Accuracy Score :  70.57128945873569


In [35]:
# Vectorize the text data
for i in range(1,5):
    
    for x in range(1,5):
        if x>=i:
            
            bow_counts = CountVectorizer(tokenizer=word_tokenize, stop_words=stop_words, ngram_range=(i, x))
            X_train_bow = bow_counts.fit_transform(X_train)
            X_test_bow = bow_counts.transform(X_test)
            
            # Train Logistic Regression model
            model = LogisticRegression(C=1, solver="liblinear", max_iter=200, class_weight='balanced')
            model.fit(X_train_bow, y_train)
            
            # Make predictions
            y_pred = model.predict(X_test_bow)
            
            # Evaluate the model
            accuracy = accuracy_score(y_test, y_pred)
            print("Accuracy:", accuracy * 100)
            print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))




Accuracy: 80.90968108415159

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.81      0.75      0.78      4270
    Negative       0.84      0.83      0.84      7400
     Neutral       0.83      0.78      0.80      6077
    Positive       0.77      0.85      0.81      6899

    accuracy                           0.81     24646
   macro avg       0.81      0.80      0.81     24646
weighted avg       0.81      0.81      0.81     24646





Accuracy: 88.55392355757526

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.92      0.84      0.88      4270
    Negative       0.91      0.90      0.90      7400
     Neutral       0.90      0.87      0.88      6077
    Positive       0.84      0.92      0.88      6899

    accuracy                           0.89     24646
   macro avg       0.89      0.88      0.88     24646
weighted avg       0.89      0.89      0.89     24646





Accuracy: 88.47277448673213

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.92      0.82      0.87      4270
    Negative       0.90      0.90      0.90      7400
     Neutral       0.90      0.87      0.88      6077
    Positive       0.83      0.92      0.87      6899

    accuracy                           0.88     24646
   macro avg       0.89      0.88      0.88     24646
weighted avg       0.89      0.88      0.88     24646

Accuracy: 87.60042197516839

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.92      0.80      0.86      4270
    Negative       0.89      0.90      0.90      7400
     Neutral       0.89      0.86      0.87      6077
    Positive       0.83      0.91      0.87      6899

    accuracy                           0.88     24646
   macro avg       0.88      0.87      0.87     24646
weighted avg       0.88      0.88      0.88     24646





Accuracy: 88.47277448673213

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.97      0.83      0.90      4270
    Negative       0.96      0.86      0.90      7400
     Neutral       0.95      0.87      0.91      6077
    Positive       0.75      0.96      0.85      6899

    accuracy                           0.88     24646
   macro avg       0.91      0.88      0.89     24646
weighted avg       0.90      0.88      0.89     24646





Accuracy: 88.29830398441938

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.98      0.82      0.89      4270
    Negative       0.96      0.86      0.91      7400
     Neutral       0.96      0.86      0.91      6077
    Positive       0.74      0.97      0.84      6899

    accuracy                           0.88     24646
   macro avg       0.91      0.88      0.89     24646
weighted avg       0.90      0.88      0.89     24646





Accuracy: 87.79923719873408

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.98      0.81      0.88      4270
    Negative       0.96      0.86      0.90      7400
     Neutral       0.96      0.85      0.90      6077
    Positive       0.73      0.97      0.84      6899

    accuracy                           0.88     24646
   macro avg       0.91      0.87      0.88     24646
weighted avg       0.90      0.88      0.88     24646





Accuracy: 82.63004138602614

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.99      0.74      0.85      4270
    Negative       0.98      0.75      0.85      7400
     Neutral       0.96      0.79      0.87      6077
    Positive       0.63      0.99      0.77      6899

    accuracy                           0.83     24646
   macro avg       0.89      0.82      0.84     24646
weighted avg       0.88      0.83      0.83     24646





Accuracy: 81.31948389190944

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       1.00      0.72      0.83      4270
    Negative       0.98      0.75      0.85      7400
     Neutral       0.96      0.76      0.85      6077
    Positive       0.61      0.99      0.75      6899

    accuracy                           0.81     24646
   macro avg       0.89      0.80      0.82     24646
weighted avg       0.88      0.81      0.82     24646

Accuracy: 74.15402093646027

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       1.00      0.62      0.77      4270
    Negative       0.99      0.63      0.77      7400
     Neutral       0.97      0.67      0.79      6077
    Positive       0.52      0.99      0.69      6899

    accuracy                           0.74     24646
   macro avg       0.87      0.73      0.76     24646
weighted avg       0.86      0.74      0.75     24646



In [38]:
bow_counts = CountVectorizer(tokenizer=word_tokenize, stop_words=stop_words, ngram_range=(1, 2))
X_train_bow = bow_counts.fit_transform(X_train)
X_test_bow = bow_counts.transform(X_test)
            
# Train Logistic Regression model
model = LogisticRegression(C=1, solver="liblinear", max_iter=200, class_weight='balanced')
model.fit(X_train_bow, y_train)
            
# Make predictions
y_pred = model.predict(X_test_bow)
            
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy * 100)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))




Accuracy: 88.55392355757526

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.92      0.84      0.88      4270
    Negative       0.91      0.90      0.90      7400
     Neutral       0.90      0.87      0.88      6077
    Positive       0.84      0.92      0.88      6899

    accuracy                           0.89     24646
   macro avg       0.89      0.88      0.88     24646
weighted avg       0.89      0.89      0.89     24646





In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000, stop_words=stop_words)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Set up parameter grid for Logistic Regression
param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [200, 500]
}

# Initialize the Logistic Regression model
logistic_model = LogisticRegression(class_weight='balanced')

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(logistic_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_tfidf, y_train)

# Best model from Grid Search
best_logistic_model = grid_search.best_estimator_

# Make predictions with the best model
y_pred = best_logistic_model.predict(X_test_tfidf)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
print("Best Model Accuracy:", accuracy * 100)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))


Best Model Accuracy: 77.86659092753389

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.73      0.72      0.73      4270
    Negative       0.84      0.80      0.82      7400
     Neutral       0.77      0.77      0.77      6077
    Positive       0.76      0.81      0.78      6899

    accuracy                           0.78     24646
   macro avg       0.77      0.77      0.77     24646
weighted avg       0.78      0.78      0.78     24646

