In [9]:
import pandas as pd

# Labelled corpus for the supervised model; later cells read its 'text' and
# 'label' columns.
labeled_data_path = '../data/labeled_data_for_supervised.json'
df = pd.read_json(labeled_data_path)

In [10]:
# Load the suspicious-phrase list, one phrase per line.
# Blank / whitespace-only lines are dropped: after strip() they become '', and
# '' is a substring of every string, so a single empty line would mark every
# row as suspicious.
# NOTE(review): the file is opened with the platform default encoding; confirm
# the file's encoding and pass it explicitly if the phrases contain non-ASCII
# characters.
with open('../data/suspicious_phrases.txt') as f:
    phrases = [phrase for phrase in (line.strip() for line in f) if phrase]

# Binary feature: 1 when the text contains at least one phrase as a
# (case-sensitive) substring, otherwise 0. The generator lets any() stop at the
# first hit instead of testing every phrase.
df['is_suspicious'] = df['text'].apply(
    lambda text: int(any(phrase in text for phrase in phrases))
)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over n-grams of length 1 through 5, keeping only terms whose document
# frequency is at least 20.
# NOTE(review): the vectorizer is fitted on every row of df['text'], i.e. before
# the train/test split performed later in the notebook, so held-out rows
# contribute to the vocabulary and IDF weights.
tfidf = TfidfVectorizer(
    min_df=20,
    ngram_range=(1, 5),
)
tfidf_data = tfidf.fit_transform(df['text'])

In [12]:
# Dense, labelled view of the TF-IDF features (one column per vocabulary term).
# - The float32 cast is done while the data is still sparse, so no dense
#   float64 copy is created first.
# - toarray() returns a plain ndarray; todense() returns the legacy np.matrix.
# - The frame reuses df's index so that later index-aligned assignments from df
#   columns line up row-for-row even if df does not have a default RangeIndex.
tfidf_matrix = pd.DataFrame(
    tfidf_data.astype('float32').toarray(),
    index=df.index,
    columns=tfidf.get_feature_names_out(),
)
tfidf_matrix

Unnamed: 0,00,00 15,00 15 00,00 16,00 16 00,00 17,00 17 00,00 do,00 do 13,00 do 13 500,...,życia,życie,życiem,życiu,żywo,żywo klientemnie,żywo klientemnie wymagamy,żywo klientemnie wymagamy dzwonienia,żywo klientemnie wymagamy dzwonienia po,żądania
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
import pickle
# Persist the fitted vectorizer so the same vocabulary/IDF weights can be
# reloaded later. The context manager guarantees the file is flushed and closed
# (the bare open(...) handle was never closed); write-only binary mode is enough.
with open("TFIDF_serialized.pickle", "wb") as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Feature matrix = TF-IDF columns + the binary 'is_suspicious' flag.
# assign() returns a new frame, so tfidf_matrix itself is left untouched
# (plain `X = tfidf_matrix` is only an alias, and adding a column through it
# would also modify tfidf_matrix).
# to_numpy() attaches the flag by row position, which is how the TF-IDF rows
# were produced from df['text'], instead of relying on the two frames having
# identical indexes.
X = tfidf_matrix.assign(is_suspicious=df['is_suspicious'].to_numpy())

# Target vector.
y = df['label']

In [15]:
# Random forest with library-default hyperparameters; only the seed is pinned
# so repeated runs build the same model.
clf = RandomForestClassifier(random_state=0)

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [18]:
# Train the classifier on the training split only.
clf.fit(X_train, y_train)

In [19]:
# Predict labels for the held-out rows.
pred = clf.predict(X_test)

In [20]:
# Rebinds `pred` from the raw predict() output to a one-column DataFrame so it
# renders as a table; the metrics cell further down receives this DataFrame.
pred = pd.DataFrame(pred)
pred

Unnamed: 0,0
0,1
1,0
2,0
3,1
4,1
...,...
1125,1
1126,0
1127,1
1128,0


In [21]:
from sklearn.metrics import confusion_matrix, f1_score

# F1 on the held-out split is printed; the confusion matrix is the cell's
# displayed value.
test_f1 = f1_score(y_test, pred)
print(test_f1)
confusion_matrix(y_test, pred)

0.952712100139082


array([[377,  18],
       [ 50, 685]], dtype=int64)

In [22]:
from joblib import dump
# Save the trained forest to a path relative to the working directory; the
# returned list of written filenames is the cell output.
# NOTE(review): the TF-IDF vectorizer is saved separately (with pickle), so
# both files are needed to score new text.
dump(clf, 'random_forest_model.joblib')

['random_forest_model.joblib']