In [59]:
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from qwlist.qwlist import QList
import numpy as np

In [7]:
df = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
df.head()

Unnamed: 0,sentence,target
0,Dla mnie faworytem do tytułu będzie Cracovia. ...,0
1,@anonymized_account @anonymized_account Brawo ...,0
2,"@anonymized_account @anonymized_account Super,...",0
3,@anonymized_account @anonymized_account Musi. ...,0
4,"Odrzut natychmiastowy, kwaśna mina, mam problem",0


In [14]:
df.groupby('target').count()

Unnamed: 0_level_0,sentence
target,Unnamed: 1_level_1
0,9190
1,851


In [8]:
with open('../data/stopwords_pl.txt', 'r', encoding='utf-8') as file:
    stopwords = [line.strip() for line in file.readlines()]

In [9]:
def remove_usernames(text: str):
    s = QList(text.split(' '))
    if '@anonymized_account' in s:
        return " ".join(s.filter(lambda x: x != "@anonymized_account"))
    return " ".join(s)

In [11]:
clean_df = df
clean_df['sentence'] = df['sentence'].apply(remove_usernames)

In [12]:
clean_df.head()

Unnamed: 0,sentence,target
0,Dla mnie faworytem do tytułu będzie Cracovia. ...,0
1,Brawo ty Daria kibic ma być na dobre i złe,0
2,"Super, polski premier składa kwiaty na grobach...",0
3,Musi. Innej drogi nie mamy.,0
4,"Odrzut natychmiastowy, kwaśna mina, mam problem",0


In [13]:
train_df, test_df = train_test_split(clean_df)
train_df.head()

Unnamed: 0,sentence,target
9603,"RT Może szukają kasy, bo głównemu reklamodawcy...",0
5563,Czyli koalicja KO i PSL ma 264 o 10 więcej. Ro...,0
8265,"A tak, znam tę zasadę ;-)",0
6097,Mniemam że nie masz z tym problemu 😁😁😁😁,0
96,Prezes się cieszy ze Wisla odpadła bo w razie ...,0


In [49]:
cv = CountVectorizer(stop_words=stopwords, max_features=1000)
cv.fit(train_df['sentence'])

In [62]:
tfidf = TfidfVectorizer(stop_words=stopwords, max_features=1000)
tfidf.fit(train_df['sentence'])

In [56]:
x_train = cv.transform(train_df['sentence']).toarray()
y_train = train_df['target'].values

x_test = cv.transform(test_df['sentence']).toarray()
y_test = test_df['target'].values

In [60]:
svm = SVC()
svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)
f1_score(y_test, y_pred)

0.028037383177570093

In [61]:
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)
f1_score(y_test, y_pred)

0.21951219512195125

In [63]:
x_train = tfidf.transform(train_df['sentence']).toarray()
y_train = train_df['target'].values

x_test = tfidf.transform(test_df['sentence']).toarray()
y_test = test_df['target'].values

In [64]:
svm = SVC()
svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)
f1_score(y_test, y_pred)

0.08849557522123894

In [65]:
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)
f1_score(y_test, y_pred)

0.1992619926199262