#NLP
# logistic regression

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kazanova/sentiment140")

print("Path to dataset files:", path)


  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/kazanova/sentiment140?dataset_version_number=2...


100%|██████████| 80.9M/80.9M [00:06<00:00, 12.2MB/s]

Extracting files...





Path to dataset files: C:\Users\SudhirYadav\.cache\kagglehub\datasets\kazanova\sentiment140\versions\2


In [None]:
import pandas as pd
# Load the dataset
df = pd.read_csv(path + "/training.1600000.processed.noemoticon.csv", encoding='latin-1', header=None)


In [5]:
import numpy as np
import re

# natural language processing
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# machine learning sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SudhirYadav\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [None]:
# stopwords are common words that do not contribute much to the meaning of a sentence
# e.g., "the", "is", "in", "and" 
# We can remove these from our text data to improve model performance
print(stopwords.words('english'))


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [None]:
# data preprocessing
twitter_data = df
twitter_data.shape


(1600000, 6)

In [10]:
# renaming the header of the dataset
twitter_data.columns = ['target', 'id', 'date', 'flag', 'user', 'text']


In [None]:
twitter_data.head()


Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Checking for null values in the dataset 
twitter_data.isnull().sum()


target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [None]:
# checking the distribution of the target variable
twitter_data['target'].value_counts()


target
0    800000
4    800000
Name: count, dtype: int64

In [None]:
# converting the target variable to binary 1 and 0
#twitter_data['target'] = twitter_data['target'].map({4: 1})
twitter_data.replace({4: 1, 0: 0}, inplace=True)


In [17]:
twitter_data['target'].value_counts()


target
1.0    800000
Name: count, dtype: int64

In [18]:
# Streamlining the text data 
# streamlining means reducing the text to its base form
# e.g., "running" becomes "run", "better" becomes "good" actor, actress, actress becomes "act"
port_stem = PorterStemmer()


In [19]:
def streamlining_text(content):
    stemmed_content = re.sub('[^A-Za-z]',' ', content)  # remove @mentions
    stemmed_content= stemmed_content.lower()  # convert to lowercase
    stemmed_content = stemmed_content.split()  # split into words
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in set(stopwords.words('english'))]  # remove stopwords
    stemmed_content = ' '.join(stemmed_content)  # join words back into a string
    return stemmed_content


In [20]:
twitter_data['streamlined_content'] = twitter_data['text'].apply(streamlining_text)


KeyboardInterrupt: 

In [None]:
twitter_data['streamlined_content'].head()


In [None]:
# splitting the dataset into training and testing sets
X = twitter_data['streamlined_content'].values
Y = twitter_data['target'].values
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,stratify=Y, random_state=2)


In [None]:
# converting text data to numerical data using TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)


In [None]:
# Training the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [None]:
# Model Evaluation
# accuracy on training data
y_train_pred = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train, y_train_pred)


In [None]:
# accuracy on test data
y_test_pred = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test, y_test_pred)


In [None]:
# packing the model
import pickle
pickle.dump(model, open('logistic_regression_model.pkl', 'wb'))
pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))


In [None]:
# usibg the model to make predictions
loaded_model = pickle.load(open('logistic_regression_model.pkl', 'rb'))
loaded_vectorizer = pickle.load(open('vectorizer.pkl', 'rb'))
# Example usage of the loaded model
example_text = "I love this product!"
example_text_vectorized = loaded_vectorizer.transform([example_text])
prediction = loaded_model.predict(example_text_vectorized)  
