In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


# Data Load

In [2]:
# Download the kazanova/sentiment140 dataset archive with the Kaggle CLI.
!kaggle datasets download -d kazanova/sentiment140


Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


# Data Extraction

In [3]:
from zipfile import ZipFile

file_name = "sentiment140.zip"

# Extract every member of the downloaded archive into the working directory.
# The handle is named `archive` rather than `zip`: binding `zip` here would
# shadow the builtin zip() in the notebook's global namespace, leaving it bound
# to a closed ZipFile for every later cell.
with ZipFile(file_name, 'r') as archive:
  archive.extractall()
  print("Done")

Done


# Import Dependencies

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, f1_score, classification_report
from bs4 import BeautifulSoup
import plotly.graph_objects as go
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data Cleaning

In [5]:
# The CSV is read without a header row, so the six column labels are
# supplied directly to the reader.
col_name = ['target', 'id', 'date', 'flag', 'user', 'tweet']
tweets = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
    header=None,
    names=col_name,
)
tweets.head()

Unnamed: 0,target,id,date,flag,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
# Remap label 4 to 1 in the target column only.
# A frame-wide replace would also rewrite any 4 that appears in another
# column (for example `id`), and it scans every column for no benefit.
tweets['target'] = tweets['target'].replace({4: 1})

In [7]:
# Check how many rows carry each label after the 4 -> 1 remap.
tweets['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

# Data Preprocessing

In [8]:
# Built once when the cell runs: constructing the stemmer and loading the NLTK
# stop-word list inside the function repeated that work for every tweet.
_STEMMER = SnowballStemmer('english')
_STOP_WORDS = frozenset(stopwords.words('english'))


def process_tweet(tweet):
  """Normalise a raw tweet into a space-separated string of stemmed tokens.

  Steps, in order: strip HTML markup, drop URLs, keep only ASCII letters,
  digits and whitespace, replace digit runs with a space, drop any remaining
  non-ASCII characters, lowercase, remove English stop words, then stem each
  surviving word with the Snowball stemmer.

  Stop words are removed *before* stemming. The stop-word list holds unstemmed
  forms, so filtering after stemming lets words such as "because" (stemmed to
  "becaus") or "very" ("veri") slip through.

  Parameters
  ----------
  tweet : str
      Raw tweet text.

  Returns
  -------
  str
      The cleaned tokens joined by single spaces (may be empty).
  """
  tweet = BeautifulSoup(tweet, 'html.parser').get_text()   # remove html tags
  tweet = re.sub(r'http\S+', '', tweet)                     # remove urls
  # Keep only ASCII letters, digits and whitespace. This already removes all
  # punctuation, so a separate punctuation pass is unnecessary.
  tweet = re.sub(r'[^a-zA-Z0-9\s]', '', tweet)
  tweet = re.sub(r'\d+', ' ', tweet)                        # remove numbers
  # \s above also matches non-ASCII whitespace; drop whatever of it is left.
  tweet = re.sub(r'[^\x00-\x7F]+', '', tweet)
  tweet = tweet.lower()                                     # convert to lowercase

  # split() already collapses whitespace runs, so no explicit squeeze is needed.
  words = [word for word in tweet.split() if word not in _STOP_WORDS]
  return ' '.join(_STEMMER.stem(word) for word in words)

In [9]:
# Test Run: show one raw tweet followed by its processed form
sample_tweet = tweets['tweet'].iloc[30]
print(sample_tweet)
print(process_tweet(sample_tweet))

some1 hacked my account on aim  now i have to make a new one
hack account aim make new one


In [10]:
# Run process_tweet over every tweet and store the result in a new 'Stemmed' column.
tweets['Stemmed'] = tweets['tweet'].apply(process_tweet)


  tweet = BeautifulSoup(tweet, 'html.parser').get_text()                        #remove html tags


In [11]:
def evaluate_and_plot(models, X_test, y_test):
    """Score each model on the given test data and show a grouped bar chart.

    Parameters
    ----------
    models : iterable of (name, estimator) pairs
        Each estimator must already be fitted and expose ``predict``.
    X_test : array-like
        Inputs handed to every estimator's ``predict``.
    y_test : array-like
        True labels. Precision, recall and F1 are read from the
        classification-report entry keyed ``'1'``, so that class must occur.

    Returns
    -------
    None
        The Plotly figure is displayed via ``fig.show()``.
    """
    score_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score']

    # One record per model; the frame below keeps the same column order.
    rows = []
    for label, model in models:
        predictions = model.predict(X_test)
        class_one = classification_report(y_test, predictions, output_dict=True)['1']
        rows.append({
            'Model': label,
            'Accuracy': accuracy_score(y_test, predictions),
            'Precision': class_one['precision'],
            'Recall': class_one['recall'],
            'F1-Score': class_one['f1-score'],
        })

    # Explicit columns keep the frame well-formed even when `models` is empty.
    metrics_df = pd.DataFrame(rows, columns=['Model'] + score_names)

    # Plotting: one bar trace per metric, grouped by model
    fig = go.Figure(data=[
        go.Bar(x=metrics_df['Model'], y=metrics_df[score], name=score)
        for score in score_names
    ])
    fig.update_layout(
        title='Comparison of Model Metrics',
        xaxis_title='Model',
        yaxis_title='Score',
        barmode='group',
    )
    fig.show()


# Vectorisation


In [12]:
# Hold out 20% of the rows for testing, stratified on the target labels,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    tweets['Stemmed'],
    tweets['target'],
    test_size=0.2,
    stratify=tweets['target'],
    random_state=54,
)


In [13]:
# Define pipelines for each technique

# The saga solver shuffles the training data, so an unseeded classifier gives
# slightly different coefficients on every run. A fixed seed keeps the model
# comparison reproducible.
CLASSIFIER_SEED = 54


def make_classifier():
    """Return a fresh, unfitted logistic-regression classifier.

    Every pipeline gets its own instance with identical settings, so the
    vectorisation step is the only thing that differs between pipelines.
    """
    return LogisticRegression(max_iter=1000, solver='saga', random_state=CLASSIFIER_SEED)


# TF-IDF weighted unigrams
tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', make_classifier())
])

# Raw counts of bigrams and trigrams
ngram_pipeline = Pipeline([
    ('ngram', CountVectorizer(ngram_range=(2, 3))),
    ('classifier', make_classifier())
])

# Raw counts of unigrams (bag of words)
bow_pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('classifier', make_classifier())
])

# All three feature sets concatenated side by side
combined_pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tfidf', TfidfVectorizer()),
        ('ngram', CountVectorizer(ngram_range=(2, 3))),
        ('bow', CountVectorizer())
    ])),
    ('classifier', make_classifier())
])

In [14]:
# Pair each pipeline with the label shown for it on the comparison chart.
pipelines = [
    ('TF-IDF', tfidf_pipeline),
    ('N-Gram', ngram_pipeline),
    ('BoW', bow_pipeline),
    ('Combined', combined_pipeline),
]

# Fit every pipeline on the training split; the labels are not needed here.
for _, estimator in pipelines:
    estimator.fit(X_train, y_train)



In [15]:
# Score every fitted pipeline on the test split and plot the metric comparison.
evaluate_and_plot(pipelines, X_test, y_test)