In [1]:
import json
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Pobranie zasobów WordNet
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samue\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\samue\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samue\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:

df = pd.read_json('tweet_subset_15.json')

label = pd.read_csv('label.csv')
label['id'] = label['id'].apply(lambda x: str(x).lstrip('u'))

df = df[df['lang'] == 'en']
df = df[['text', 'author_id']]
label['id'] = label['id'].astype('Int64')
df = pd.merge(df, label, left_on='author_id', right_on='id', how='left')


In [4]:
human_indices = df[df['label'] == 'human'].sample(n=(df['label'] == 'bot').sum(), random_state=42).index
# Zachowaj tylko te losowe 'human' oraz wszystkie boty (label == 1)
df = df.loc[human_indices.union(df[df['label'] == 'bot'].index)].reset_index(drop=True)

In [5]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Prepare texts for training
texts = df['text'].tolist()

# Initialize and train BPE tokenizer
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[UNK]", "[PAD]"], vocab_size=10000)
tokenizer.train_from_iterator(texts, trainer)

# Tokenize using BPE
df['BPE_TOKENS'] = df['text'].apply(lambda x: tokenizer.encode(x).tokens)

In [7]:
df['label'] = df['label'].apply(lambda x: 1 if x == 'bot' else 0)

In [8]:
# Usuwanie stopwords
stop_words = set(stopwords.words('english'))
df['FILTERED'] = df['BPE_TOKENS'].apply(lambda x: [word for word in x if word.lower() not in stop_words])

df

Unnamed: 0,text,author_id,id,label,BPE_TOKENS,FILTERED
0,#LVMSpoilers episode 10? Amazing,1304855289208819713,1304855289208819713,0,"[#, LV, MS, po, il, ers, episode, 10, ?, Amazing]","[#, LV, MS, po, il, ers, episode, 10, ?, Amazing]"
1,Just IMAGINE if @TheVulcanSalute chose to shar...,1304855289208819713,1304855289208819713,0,"[Just, IM, AG, INE, if, @, The, V, ul, can, Sa...","[IM, AG, INE, @, V, ul, Sal, ute, cho, se, sha..."
2,RT @LaurenWalshArt: The “Matt Mercer Effect” f...,1304855289208819713,1304855289208819713,0,"[RT, @, La, uren, Wal, sh, Art, :, The, “, Mat...","[RT, @, La, uren, Wal, sh, Art, :, “, Matt, Me..."
3,Fearne you bad bitch #CriticalRoleSpoilers,1304855289208819713,1304855289208819713,0,"[F, ear, ne, you, bad, bit, ch, #, Crit, ical,...","[F, ear, ne, bad, bit, ch, #, Crit, ical, Ro, ..."
4,@Thee_MC89 He keeps Clean sheets in all his mo...,999238148075991040,999238148075991040,0,"[@, The, e_, MC, 89, He, keeps, Cle, an, she, ...","[@, e_, MC, 89, keeps, Cle, ets, mo, vies, bal..."
...,...,...,...,...,...,...
117265,Here's my #AcademicChatter: I have some dept. ...,16298298,16298298,0,"[Here, ', s, my, #, Academic, Ch, atter, :, I,...","[', #, Academic, Ch, atter, :, de, pt, ., ut, ..."
117266,RT @GoKeyNetwork: It's happening...\n\n#Cardan...,825920479055671296,825920479055671296,0,"[RT, @, Go, Key, Network, :, It, ', s, happeni...","[RT, @, Go, Key, Network, :, ', happening, ......"
117267,holy shit look at this cast. ONE WEEK until @h...,23027237,23027237,0,"[hol, y, shit, look, at, this, cast, ., ONE, W...","[hol, shit, look, cast, ., ONE, EK, @, hr, net..."
117268,RT @NerdPokerQuotes: .@evazan: He's amazing wi...,137865227,137865227,0,"[RT, @, N, er, d, Po, ker, Qu, otes, :, .@, ev...","[RT, @, N, er, Po, ker, Qu, otes, :, .@, ev, a..."


In [9]:
print(df[['text', 'FILTERED']].head(10))

                                                text  \
0                   #LVMSpoilers episode 10? Amazing   
1  Just IMAGINE if @TheVulcanSalute chose to shar...   
2  RT @LaurenWalshArt: The “Matt Mercer Effect” f...   
3         Fearne you bad bitch #CriticalRoleSpoilers   
4  @Thee_MC89 He keeps Clean sheets in all his mo...   
5  @Teambridge263 @Tommy_Le_Roi Someone translate...   
6                                 @niza116 Respect 💯   
7  Just posted a photo @ Zoom Online https://t.co...   
8  Just posted a photo @ Featherstone Center for ...   
9  RT @HotspursTurkey: GOL SONNY🔥 #COYS https://t...   

                                            FILTERED  
0  [#, LV, MS, po, il, ers, episode, 10, ?, Amazing]  
1  [IM, AG, INE, @, V, ul, Sal, ute, cho, se, sha...  
2  [RT, @, La, uren, Wal, sh, Art, :, “, Matt, Me...  
3  [F, ear, ne, bad, bit, ch, #, Crit, ical, Ro, ...  
4  [@, e_, MC, 89, keeps, Cle, ets, mo, vies, bal...  
5  [@, Team, bridge, 26, 3, @, Tom, _, Le, _, Ro,... 

In [10]:
# Wektoryzacja 
vectorizer = TfidfVectorizer()
df['FILTERED_TEXT'] = df['FILTERED'].apply(lambda x: ' '.join(x))
X = vectorizer.fit_transform(df['FILTERED_TEXT'])
Y = df['label']

# Budowa modelu

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score

# Podział danych na zbiór treningowy i testowy
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=47)

# Uczenie modelu
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Predykcja
y_pred = nb_classifier.predict(X_test)

# Ocena modelu
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='weighted'))
print('Recall:', recall_score(y_test, y_pred, average='weighted'))
print('F1 Score:', f1_score(y_test, y_pred, average='weighted'))
print('Classification Report:', classification_report(y_test, y_pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))

with open('nb_classifier1.pkl', 'wb') as f:
	pickle.dump(nb_classifier, f)

Accuracy: 0.5958471902447344
Precision: 0.5960625070702251
Recall: 0.5958471902447344
F1 Score: 0.5957735613052197
Classification Report:               precision    recall  f1-score   support

           0       0.59      0.61      0.60     11645
           1       0.60      0.58      0.59     11809

    accuracy                           0.60     23454
   macro avg       0.60      0.60      0.60     23454
weighted avg       0.60      0.60      0.60     23454

Confusion Matrix: 
 [[7110 4535]
 [4944 6865]]


In [12]:
from sklearn.svm import LinearSVC
# Uczenie modelu
svm_classifier = LinearSVC(random_state=2410, max_iter=10000)
svm_classifier.fit(X_train, y_train)

# Predykcja
y_pred = svm_classifier.predict(X_test)

# Ocena modelu
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='weighted'))
print('Recall:', recall_score(y_test, y_pred, average='weighted'))
print('F1 Score:', f1_score(y_test, y_pred, average='weighted'))
print('Classification Report:\n', classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

with open('svm_classifier1.pkl', 'wb') as f:
	pickle.dump(svm_classifier, f)




Accuracy: 0.5958045535942696
Precision: 0.5958935073460159
Recall: 0.5958045535942696
F1 Score: 0.5957946780807973
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.60      0.60     11645
           1       0.60      0.59      0.59     11809

    accuracy                           0.60     23454
   macro avg       0.60      0.60      0.60     23454
weighted avg       0.60      0.60      0.60     23454

Confusion Matrix:
 [[7017 4628]
 [4852 6957]]


In [14]:
from sklearn.ensemble import RandomForestClassifier

# Uczenie modelu
rf_classifier = RandomForestClassifier(n_estimators= 200, max_depth=100, max_features="sqrt", n_jobs=1, random_state=7)
rf_classifier.fit(X_train, y_train)

# Predykcja
y_pred = rf_classifier.predict(X_test)

# Ocena modelu
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='weighted'))
print('Recall:', recall_score(y_test, y_pred, average='weighted'))
print('F1 Score:', f1_score(y_test, y_pred, average='weighted'))
print('Classification Report:', classification_report(y_test, y_pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))

with open('rf_classifier1.pkl', 'wb') as f:
	pickle.dump(rf_classifier, f)

Accuracy: 0.612432847275518
Precision: 0.6127489961826476
Recall: 0.612432847275518
F1 Score: 0.6123168040181157
Classification Report:               precision    recall  f1-score   support

           0       0.61      0.63      0.62     11645
           1       0.62      0.59      0.61     11809

    accuracy                           0.61     23454
   macro avg       0.61      0.61      0.61     23454
weighted avg       0.61      0.61      0.61     23454

Confusion Matrix: 
 [[7348 4297]
 [4793 7016]]


# Zapisanie modelu

In [15]:
from pickle import dump, load

classifiers = {
    'nb_classifier.pkl': nb_classifier,
    'svm_classifier.pkl': svm_classifier,
    'rf_classifier.pkl': rf_classifier,
    'vectorizer.pkl': vectorizer,
}

for filename, classifier in classifiers.items():
    with open(filename, 'wb') as f:
        dump(classifier, f, protocol=5)

loaded_classifiers = {}
for filename in classifiers.keys():
    with open(filename, 'rb') as f:
        loaded_classifiers[filename] = load(f)