# TF-IDF-Logistic Regression

In [1]:
!pip install -q nltk scikit-learn pandas matplotlib seaborn wordcloud
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/Practice/NLP/chatgpt.csv')
df.head

Mounted at /content/drive


In [3]:
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219294 entries, 0 to 219293
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  219294 non-null  int64 
 1   tweets      219294 non-null  object
 2   labels      219294 non-null  object
dtypes: int64(1), object(2)
memory usage: 5.0+ MB
None


Unnamed: 0.1,Unnamed: 0,tweets,labels
0,0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,1,"Try talking with ChatGPT, our new AI system wh...",good
2,2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,3,"THRILLED to share that ChatGPT, our new model ...",good
4,4,"As of 2 minutes ago, @OpenAI released their ne...",bad


Clean and Preprocess the Data

In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [5]:
df = df.drop(columns = ['Unnamed: 0'])

In [6]:
stop_words = set(stopwords.words('english'))

In [7]:
def clean_text (text):
  text = text.lower()
  text = text.translate(str.maketrans('', '', string.punctuation))
  tokens = word_tokenize(text)
  tokens = [word for word in tokens if word not in stop_words]
  return ' '.join(tokens)


In [8]:
df['cleaned_tweets'] = df['tweets'].astype(str).apply(clean_text)

In [9]:
df[['tweets', 'cleaned_tweets', 'labels']].head()

Unnamed: 0,tweets,cleaned_tweets,labels
0,ChatGPT: Optimizing Language Models for Dialog...,chatgpt optimizing language models dialogue ht...,neutral
1,"Try talking with ChatGPT, our new AI system wh...",try talking chatgpt new ai system optimized di...,good
2,ChatGPT: Optimizing Language Models for Dialog...,chatgpt optimizing language models dialogue ht...,neutral
3,"THRILLED to share that ChatGPT, our new model ...",thrilled share chatgpt new model optimized dia...,good
4,"As of 2 minutes ago, @OpenAI released their ne...",2 minutes ago openai released new chatgpt nnan...,bad


Encoding

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
label_map = {'bad': 0, 'neutral':1, 'good':2}
df['label_encoded'] = df['labels'].map(label_map)

In [12]:
df[['tweets', 'cleaned_tweets', 'labels', 'label_encoded']].head()

Unnamed: 0,tweets,cleaned_tweets,labels,label_encoded
0,ChatGPT: Optimizing Language Models for Dialog...,chatgpt optimizing language models dialogue ht...,neutral,1
1,"Try talking with ChatGPT, our new AI system wh...",try talking chatgpt new ai system optimized di...,good,2
2,ChatGPT: Optimizing Language Models for Dialog...,chatgpt optimizing language models dialogue ht...,neutral,1
3,"THRILLED to share that ChatGPT, our new model ...",thrilled share chatgpt new model optimized dia...,good,2
4,"As of 2 minutes ago, @OpenAI released their ne...",2 minutes ago openai released new chatgpt nnan...,bad,0


Train Test Split

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X = df['cleaned_tweets']
y = df['label_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

Vectorize Text (TF-IDF)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
tfidf = TfidfVectorizer(max_features=5000)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

Train Baseline Classifier (Logistic Regression)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [18]:
clf = LogisticRegression(max_iter = 200)
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names = ['bad', 'neutral', 'good']))
print('Accuracy:', accuracy_score(y_test, y_pred))

[[20344   932   283]
 [ 2394  7187  1517]
 [  617  1691  8894]]
              precision    recall  f1-score   support

         bad       0.87      0.94      0.91     21559
     neutral       0.73      0.65      0.69     11098
        good       0.83      0.79      0.81     11202

    accuracy                           0.83     43859
   macro avg       0.81      0.80      0.80     43859
weighted avg       0.83      0.83      0.83     43859

Accuracy: 0.8305022914339132
