# **IMPORTING LIBRARIES**

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import re


# **LOADING KAGGLE DATASET**

In [2]:
#Installing Kaggle
!pip install kaggle



In [6]:
#Configuring path of kaggle .json file

import shutil
import os

# Create the .kaggle directory if it doesn't exist
os.makedirs("/root/.kaggle", exist_ok=True)

# Move the kaggle(1).json file to the .kaggle directory and rename it to kaggle.json
shutil.move("/content/kaggle.json", "/root/.kaggle/kaggle.json")

# Set permissions to the file
!chmod 600 /root/.kaggle/kaggle.json

In [7]:
!kaggle datasets download  -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 98% 79.0M/80.9M [00:02<00:00, 42.7MB/s]
100% 80.9M/80.9M [00:02<00:00, 31.8MB/s]


In [8]:
#Extracting the compressed zipfile

from zipfile import ZipFile
dataset='/content/sentiment140.zip'

with ZipFile(dataset,'r') as zip:
  zip.extractall()
  print("The dataset is extracted")

The dataset is extracted


# **UNDERSTANDING DATA**

In [9]:
twitter_data=pd.read_csv('/content/training.1600000.processed.noemoticon.csv',encoding='ISO-8859-1')

In [10]:
#Name the columns based on kaggle dataset
column_names=["target","id","date","flag","user","text"]
twitter_data=pd.read_csv('/content/training.1600000.processed.noemoticon.csv',names=column_names, encoding='ISO-8859-1')

In [11]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [12]:
# Filter out only the necessary columns
data = twitter_data[['target', 'text']]
data['target'] = data['target'].map({0: 0, 4: 1})  # Map 0 to 0 and 4 to 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['target'] = data['target'].map({0: 0, 4: 1})  # Map 0 to 0 and 4 to 1


# **CLEANING TEXT DATA AND PERFORM STEMMING**

In [13]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
# Function to clean the text
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove @mentions
    text = re.sub(r'#', '', text)  # Remove '#' symbols
    text = re.sub(r'RT\s+', '', text)  # Remove RTs
    text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)  # Remove hyperlinks
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = text.lower()  # Convert to lower case
    return text

# Clean the text
data['text'] = data['text'].apply(clean_text)

# Remove stop words and perform stemming
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

data['text'] = data['text'].apply(preprocess_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(preprocess_text)


# **TRAIN-TEST SPLIT**

In [15]:
X = data['text']
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# **FEATURE EXTRACTION**

In [16]:
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)


# **LOGISTIC REGRESSION MODEL**

In [17]:
model = LogisticRegression(max_iter=1000)  # Increase max_iter if necessary
model.fit(X_train_vectorized, y_train)


# **MODEL EVALUATION**

In [18]:
y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy * 100:.2f}%')
print(classification_report(y_test, y_pred))


Accuracy: 77.27%
              precision    recall  f1-score   support

           0       0.78      0.75      0.77    159494
           1       0.76      0.80      0.78    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



# **MULTINOMIAL NAIVE BAYES MODEL**

In [19]:
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)


# **MODEL EVALUATION**

In [20]:
y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy * 100:.2f}%')
print(classification_report(y_test, y_pred))


Accuracy: 76.27%
              precision    recall  f1-score   support

           0       0.75      0.78      0.77    159494
           1       0.77      0.75      0.76    160506

    accuracy                           0.76    320000
   macro avg       0.76      0.76      0.76    320000
weighted avg       0.76      0.76      0.76    320000

