In [124]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import pickle

In [125]:
df = pd.read_csv('/content/drive/MyDrive/Dataset/Anonote/finalized_data/English_Comments_V1.csv')

In [126]:
df.tail(100)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
24683,25195,3,0,3,0,1,where r the fun twins at @TheJsimps I'm trying...
24684,25196,6,0,6,0,1,whheeet bitch you LYING
24685,25197,3,2,1,0,0,which one of these names is more offensive kik...
24686,25198,3,0,3,0,1,which twitter bitch isn't selling chokers&#100...
24687,25199,4,0,1,3,2,who's downie like a brownie 4 brunch? Need mim...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [127]:
df_ = df[df['class'] == 2]

In [128]:
df_.head(100)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
40,40,3,0,1,2,2,""" momma said no pussy cats inside my doghouse """
63,63,3,0,0,3,2,"""@Addicted2Guys: -SimplyAddictedToGuys http://..."
66,66,3,0,1,2,2,"""@AllAboutManFeet: http://t.co/3gzUpfuMev"" woo..."
67,67,3,0,1,2,2,"""@Allyhaaaaa: Lemmie eat a Oreo &amp; do these..."
...,...,...,...,...,...,...,...
698,709,3,0,0,3,2,"#DerekJeter as a lifelong #Yankees man, I will..."
701,712,3,0,0,3,2,#EarlyBird #early #morning #sunrise #dawn #bir...
702,713,3,0,0,3,2,#Ebola Great and I thought the panic from bird...
703,714,3,0,0,3,2,#Ebola is a great metaphor for how #GOP #teapa...


In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [130]:
df['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1,19190
2,4163
0,1430


There is a huge difference between data counts

Lets make it arond the count of class 2. Class 3 mix with the class 1 since we want to find the harmfull and harmless. The class 1 and 3 are in the category of harmful. Thats why we gonna merge them

In [131]:
#lets remove some unwanted columns
df.drop(columns=['Unnamed: 0','count', 'hate_speech', 'offensive_language', 'neither'],inplace=True)

In [132]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   class   24783 non-null  int64 
 1   tweet   24783 non-null  object
dtypes: int64(1), object(1)
memory usage: 387.4+ KB


In [133]:
df['P/N'] = df['class'].map({1: 0, 2 : 1, 3: 0})

In [134]:
df.head()

Unnamed: 0,class,tweet,P/N
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,1.0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,0.0
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,0.0
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,0.0
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,0.0


In [135]:
df.drop(columns=['class'],inplace=True)
counts = df['P/N'].value_counts()
print(counts)

P/N
0.0    19190
1.0     4163
Name: count, dtype: int64


In [136]:
target_count = counts.min()

balanced_df = pd.concat([
    df[df['P/N'] == number].sample(target_count, random_state=42)
    for number in counts.index
])

In [137]:
balanced_df['P/N'].value_counts()

Unnamed: 0_level_0,count
P/N,Unnamed: 1_level_1
0.0,4163
1.0,4163


In [138]:
balanced_df.head()

Unnamed: 0,tweet,P/N
22889,Why is it everytime I go to cracker barrel the...,0.0
20565,"Run that nigga, you don't want that nigga, but...",0.0
10780,I need a girl from Jamaica I can't fuck with t...,0.0
17261,RT @ShadowBeatz_Inc: I know you have me blocke...,0.0
13954,Put ya hands up if you a Grade A bitch,0.0


In [139]:
nlp = spacy.load("en_core_web_sm")

In [140]:
def remove_punc_and_stopwords(text):
  doc = nlp(text)
  words = []
  for token in doc:
    if(not token.is_stop and not token.is_punct):
      words.append(token.lemma_)
  return " ".join(words)

In [141]:
print(remove_punc_and_stopwords('Hello! you must finish eating now'))

hello finish eat


yeah the function working properly

In [142]:
balanced_df['tweet'] = balanced_df['tweet'].apply(remove_punc_and_stopwords)

In [143]:
balanced_df.tail(100)

Unnamed: 0,tweet,P/N
13709,vote cripple amirite bigot MT @dawnnaduke vote...,1.0
20963,weird people ghetto time,1.0
10554,hope Charlie bring lube test,1.0
2380,@Adrian1_knowsu 9757;&#65039;no ghetto name 12...,1.0
13897,praise thy lord season beanie,1.0
...,...,...
20198,RT @tj_curtin bird eye view today harvest14 ac...,1.0
2869,@cogitoergobibo Penske run 22 NASCAR yellow sp...,1.0
18219,RT @Zach_Dorsey trash worth,1.0
22172,Thou shall mock ryan(the future)kelly,1.0


here we can observe some numbers and some words wiht @ and also some http linke which are not need to predict harmless


so i am gonna remove them

In [144]:
def remove_garbage_1(text):
  doc = nlp(text)
  words = []
  for token in doc:
    if not token.text.startswith('@') and not token.text.startswith('http://'):
      words.append(token.text)
  return " ".join(words)

In [145]:
balanced_df['tweet'] = balanced_df['tweet'].apply(remove_garbage_1)

In [146]:
X_train, X_test, y_train, y_test = train_test_split(
    balanced_df['tweet'],
    balanced_df['P/N'],
    test_size=0.2,
    random_state=42,
    stratify=balanced_df['P/N'],
)

In [147]:
clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.87      0.96      0.92       833
         1.0       0.96      0.86      0.91       833

    accuracy                           0.91      1666
   macro avg       0.92      0.91      0.91      1666
weighted avg       0.92      0.91      0.91      1666



In [148]:
from sklearn.linear_model import LogisticRegression

clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.98      0.92      0.95       833
         1.0       0.93      0.99      0.96       833

    accuracy                           0.95      1666
   macro avg       0.96      0.95      0.95      1666
weighted avg       0.96      0.95      0.95      1666



In [149]:
#seems like the Logistic regression model is better at predicting
path = '/content/drive/MyDrive/Dataset/Anonote/models/english_comment_classifier_v1.pkl'

with open(path, 'wb') as file:
    pickle.dump(clf, file)

Want some partion of data to train a model to check wthere its a english or singlsh text

In [150]:
partion_df = pd.concat([
    df[df['P/N'] == number].sample(150, random_state=42)
    for number in counts.index
])

In [151]:
partion_df.head()

Unnamed: 0,tweet,P/N
22889,Why is it everytime I go to cracker barrel the...,0.0
20565,"Run that nigga, you don't want that nigga, but...",0.0
10780,I need a girl from Jamaica I can't fuck with t...,0.0
17261,RT @ShadowBeatz_Inc: I know you have me blocke...,0.0
13954,Put ya hands up if you a Grade A bitch,0.0


In [152]:
partion_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, 22889 to 7947
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tweet   300 non-null    object 
 1   P/N     300 non-null    float64
dtypes: float64(1), object(1)
memory usage: 7.0+ KB


In [153]:
partion_df['tweet'] = partion_df['tweet'].apply(remove_garbage_1)

In [157]:
def remove_punc(text):
  doc = nlp(text)
  words = []
  for token in doc:
    if(not token.is_punct):
      words.append(token.lemma_)
  return " ".join(words)

In [162]:
partion_df['tweet'] = partion_df['tweet'].apply(remove_punc)

In [159]:
partion_df.drop(columns=['P/N'],inplace=True)

KeyError: "['P/N'] not found in axis"

In [163]:
partion_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, 22889 to 7947
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   300 non-null    object
dtypes: object(1)
memory usage: 4.7+ KB


In [164]:
partion_df.to_csv('/content/drive/MyDrive/Dataset/Anonote/finalized_data/english_texts_partion.csv', index=False)