Sentiment Analysis

In [3]:
import  pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import resample

Dataset Loading And Prepocessing

In [4]:

def load_data(filepath1, filepath2):
  
    d1 = pd.read_csv(filepath1, encoding='latin1', names=['polarity', 'id', 'date', 'query', 'user', 'text'])
    d2 = pd.read_csv(filepath2, encoding='latin1', names=['polarity', 'id', 'date', 'query', 'user', 'text'])
    dataset = pd.concat([d1, d2], ignore_index=True)

    return dataset

def clean_data(dataset):
   
    dataset.dropna(inplace=True)
    dataset.drop_duplicates(inplace=True)
    columns_to_drop = ['id', 'date', 'query', 'user']
    dataset = dataset.drop(columns=columns_to_drop)


    def clean_text(text):
        text = re.sub(r'http\S+', '', text)  
        text = re.sub(r'@\w+', '', text)   
        text = re.sub(r'#\w+', '', text)   
        text = re.sub(r'[^A-Za-z0-9\s]', '', text) 
        text = text.lower() 
        text = text.strip()  
        return text

    dataset['text'] = dataset['text'].apply(clean_text)

    return dataset

def split_data(dataset, test_size=0.2, random_state=42):
   
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        dataset['text'].values, 
        dataset['polarity'].values, 
        test_size=test_size,  
        random_state=random_state,
        stratify=dataset['polarity']
    )

    return train_texts, test_texts, train_labels, test_labels


filepath1 = r"C:\Users\Vignesh\Downloads\sentiment_data2\testdata.manual.2009.06.14.csv"
filepath2 = r"C:\Users\Vignesh\Downloads\sentiment_data2\training.1600000.processed.noemoticon.csv"

dataset = load_data(filepath1, filepath2)


cleaned_dataset = clean_data(dataset)
df_class_0 = cleaned_dataset[cleaned_dataset['polarity'] == 0]
df_class_2 = cleaned_dataset[cleaned_dataset['polarity'] == 2]
df_class_4 = cleaned_dataset[cleaned_dataset['polarity'] == 4]
df_class_2_upsampled = resample(df_class_2, 
                                replace=True,     
                                n_samples=max(len(df_class_0), len(df_class_4)),  
                                random_state=42)
  
balanced_dataset = pd.concat([df_class_0, df_class_2_upsampled, df_class_4])
balanced_dataset = balanced_dataset.sample(frac=1, random_state=42).reset_index(drop=True)
print(balanced_dataset['polarity'].value_counts())
train_texts, test_texts, train_labels, test_labels = split_data(balanced_dataset)

print(len(train_texts), len(test_texts))
print(pd.Series(train_labels).value_counts())
print(pd.Series(test_labels).value_counts())


polarity
2    800182
4    800182
0    800177
Name: count, dtype: int64
1920432 480109
2    640145
4    640145
0    640142
Name: count, dtype: int64
4    160037
2    160037
0    160035
Name: count, dtype: int64


Traning And Evaluation 

In [5]:

def train_ml_model(train_texts, train_labels):
  
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(max_iter=1000, solver='liblinear'))
    ])
    pipeline.fit(train_texts, train_labels)
    
    return pipeline

def evaluate_ml_model(model, test_texts, test_labels):
    predicted_labels = model.predict(test_texts)
    accuracy = accuracy_score(test_labels, predicted_labels)
    report = classification_report(test_labels, predicted_labels)
    
    return accuracy, report
model = train_ml_model(train_texts, train_labels)


accuracy, report = evaluate_ml_model(model, test_texts, test_labels)
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.8658637934302419
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.79      0.80    160035
           2       1.00      1.00      1.00    160037
           4       0.79      0.81      0.80    160037

    accuracy                           0.87    480109
   macro avg       0.87      0.87      0.87    480109
weighted avg       0.87      0.87      0.87    480109

