#### Importing libraries and removing duplicates and null values

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('file.csv')

In [3]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
df.duplicated().sum()

1671

In [5]:
df = df.drop_duplicates(keep='first')
df.duplicated().sum()

0

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 217623 entries, 0 to 219293
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   tweets  217623 non-null  object
 1   labels  217623 non-null  object
dtypes: object(2)
memory usage: 5.0+ MB


In [7]:
df.isna().sum()

tweets    0
labels    0
dtype: int64

#### Cleaning the data up

In [8]:
df = df.reset_index().drop('index', axis=1)

In [9]:
df.labels.unique(), df.labels.value_counts()

(array(['neutral', 'good', 'bad'], dtype=object),
 labels
 bad        106695
 good        55754
 neutral     55174
 Name: count, dtype: int64)

In [10]:
df.labels = df.labels.astype('category').cat.codes

In [11]:
df.tweets.str.contains('https://').sum()

145971

In [12]:
df

Unnamed: 0,tweets,labels
0,ChatGPT: Optimizing Language Models for Dialog...,2
1,"Try talking with ChatGPT, our new AI system wh...",1
2,ChatGPT: Optimizing Language Models for Dialog...,2
3,"THRILLED to share that ChatGPT, our new model ...",1
4,"As of 2 minutes ago, @OpenAI released their ne...",0
...,...,...
217618,Other Software Projects Are Now Trying to Repl...,0
217619,I asked #ChatGPT to write a #NYE Joke for SEOs...,1
217620,chatgpt is being disassembled until it can onl...,0
217621,2023 predictions by #chatGPT. Nothing really s...,0


In [13]:
df[['tweets', 'links']] = df['tweets'].str.split('https://', expand=True, n=1)

In [14]:
df.drop('links', axis=1, inplace=True)

In [15]:
df

Unnamed: 0,tweets,labels
0,ChatGPT: Optimizing Language Models for Dialogue,2
1,"Try talking with ChatGPT, our new AI system wh...",1
2,ChatGPT: Optimizing Language Models for Dialogue,2
3,"THRILLED to share that ChatGPT, our new model ...",1
4,"As of 2 minutes ago, @OpenAI released their ne...",0
...,...,...
217618,Other Software Projects Are Now Trying to Repl...,0
217619,I asked #ChatGPT to write a #NYE Joke for SEOs...,1
217620,chatgpt is being disassembled until it can onl...,0
217621,2023 predictions by #chatGPT. Nothing really s...,0


In [16]:
df['tweets'] = df['tweets'].str.replace(r'\\n\\n|\\n', '', regex=True)

In [17]:
df['tweets'] = df.tweets.str.replace(r'[\n\r]', '', regex=True)

In [18]:
import re

In [19]:
df.tweets = df.tweets.apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

In [20]:
df.tweets = df.tweets.apply(lambda x: x.lower())

In [21]:
df.tweets

0          chatgpt optimizing language models for dialogue 
1         try talking with chatgpt our new ai system whi...
2          chatgpt optimizing language models for dialogue 
3         thrilled to share that chatgpt our new model o...
4         as of 2 minutes ago openai released their new ...
                                ...                        
217618    other software projects are now trying to repl...
217619    i asked chatgpt to write a nye joke for seos a...
217620    chatgpt is being disassembled until it can onl...
217621    2023 predictions by chatgpt nothing really spe...
217622                             from chatgpt neat stuff 
Name: tweets, Length: 217623, dtype: object

#### Removing stopwords, tokenizing and filtering text column

In [22]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [23]:
stop_words = set(stopwords.words('english'))

In [24]:
df.tweets = df.tweets.apply(lambda x: word_tokenize(x))

In [25]:
df.tweets = df.tweets.apply(lambda x: [word for word in x if word.lower() not in stop_words])

In [26]:
df.tweets = df.tweets.apply(lambda x: ' '.join(x))

In [27]:
df

Unnamed: 0,tweets,labels
0,chatgpt optimizing language models dialogue,2
1,try talking chatgpt new ai system optimized di...,1
2,chatgpt optimizing language models dialogue,2
3,thrilled share chatgpt new model optimized dia...,1
4,2 minutes ago openai released new chatgpt use ...,0
...,...,...
217618,software projects trying replicate chatgpt,0
217619,asked chatgpt write nye joke seos delivered se...,1
217620,chatgpt disassembled dissemble,0
217621,2023 predictions chatgpt nothing really specif...,0


#### Model development

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
x_train, x_test, y_train, y_test = train_test_split(df.tweets, df.labels, test_size=0.2, random_state=0)

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
vect = TfidfVectorizer()

In [32]:
x_train_v = vect.fit_transform(x_train)
x_test_v = vect.transform(x_test)

In [33]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
rfc = RandomForestClassifier()

In [35]:
rfc.fit(x_train_v, y_train)

In [36]:
y_pred = rfc.predict(x_test_v)

In [37]:
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

In [39]:
accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted')

(0.7910166570936243, 0.7847591138175354)

In [40]:
confusion_matrix(y_test, y_pred)

array([[19681,   719,   961],
       [  994,  8297,  1803],
       [ 2897,  1722,  6451]], dtype=int64)

In [41]:
combined = pd.DataFrame(dict(actual=y_test, predicted=y_pred))

In [43]:
pd.crosstab(index=combined.actual, columns=combined.predicted)

predicted,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,19681,719,961
1,994,8297,1803
2,2897,1722,6451
