In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
# Fetch the NLTK "punkt" resource when the import cell runs.
# NOTE(review): presumably needed by word_tokenize / the tokenizer helpers — confirm.
nltk.download('punkt')
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, precision_recall_curve
from matplotlib import pyplot as plt
from sklearn.metrics import plot_precision_recall_curve
import numpy as np
from sklearn.model_selection import GridSearchCV
import joblib
from tokenizer import tokenize_sentence
from tokenizer import tokenize_sentence_ru

In [None]:
# English evaluation set: read from the *test* CSV and used below for scoring.
df = pd.read_csv(
    filepath_or_buffer="./data/dataset_test_english.csv",
    sep=",",
)

In [None]:
# English training set: the frame every English model below is fitted on.
train_df = pd.read_csv(
    filepath_or_buffer="./data/dataset_train_english.csv",
    sep=",",
)

In [None]:
# Inspect the column dtypes of the English test frame as loaded from CSV.
df.dtypes

In [None]:
# Preview the first five rows of the English test frame.
df.head(n=5)

In [None]:
# Normalise the label column to a fixed-width integer dtype in both frames.
# A vectorised cast replaces the per-element Python call `.apply(int)`;
# "int64" is spelled out so the resulting dtype does not depend on the platform.
df["type"] = df["type"].astype("int64")
train_df["type"] = train_df["type"].astype("int64")

In [None]:
# First five test rows labelled 1.
df.loc[df['type'] == 1].head(n=5)

In [None]:
# First five test rows labelled 0.
df.loc[df['type'] == 0].head(n=5)

In [None]:
# Take the second test comment (position 1) and show what the project tokenizer
# produces for it.
sentence_example = df["comment"].iloc[1]
tokenize_sentence(sentence_example)

In [None]:
# Stand-alone TF-IDF vectorizer that delegates tokenisation to the project's
# tokenize_sentence helper (imported from the local tokenizer module).
vectorizer = TfidfVectorizer(tokenizer=tokenize_sentence)

In [None]:
# Learn the TF-IDF vocabulary on the training comments and vectorise them in one pass.
features = vectorizer.fit_transform(raw_documents=train_df["comment"])

In [None]:
# Baseline classifier fitted directly on the pre-computed TF-IDF features.
# random_state is pinned so repeated runs give the same fit.
model = LogisticRegression(random_state=0)
model.fit(X=features, y=train_df["type"])

In [None]:
# Sanity check: predicted label for the first training row's feature vector.
model.predict(X=features[0])

In [None]:
# Raw text of the first training comment, i.e. the row predicted in the previous cell.
train_df.loc[:, "comment"].iloc[0]

In [None]:
# End-to-end English pipeline: raw text -> TF-IDF (project tokenizer) -> logistic
# regression with default regularisation. Accepts raw strings at predict time.
model_pipeline = Pipeline(
    steps=[
        ("vectorizer", TfidfVectorizer(tokenizer=tokenize_sentence)),
        ("model", LogisticRegression(random_state=0)),
    ]
)

In [None]:
# Fit vectorizer and classifier together on the English training set.
model_pipeline.fit(X=train_df["comment"], y=train_df["type"])

In [None]:
# Pick one example comment per class from the test set.
# Both samples are selected by label: taking row 0 unconditionally only yields a
# positive example if the file happens to start with one.
pos_text = df.loc[df['type'] == 1, 'comment'].iloc[0]
neg_text = df.loc[df['type'] == 0, 'comment'].iloc[0]

In [None]:
# Display the sample comment stored in pos_text.
pos_text

In [None]:
# Display the sample comment stored in neg_text (a test row with type == 0).
neg_text

In [None]:
# Predicted label for the pos_text sample (wrapped in a list: the pipeline expects a batch).
model_pipeline.predict(X=[pos_text])

In [None]:
# Predicted label for the neg_text sample (wrapped in a list: the pipeline expects a batch).
model_pipeline.predict(X=[neg_text])

In [None]:
# Precision on the English test set using the pipeline's default predict() decision.
precision_score(
    df["type"],
    model_pipeline.predict(df["comment"]),
)

In [None]:
# Recall on the English test set using the pipeline's default predict() decision.
recall_score(
    df["type"],
    model_pipeline.predict(df["comment"]),
)

In [None]:
# Precision/recall at every candidate decision threshold on the English test set.
# Column 1 of predict_proba is used as the score — assumes it corresponds to
# label 1 (classes_ == [0, 1]); TODO confirm.
# Arguments are positional on purpose: the score keyword was renamed from
# `probas_pred` to `y_score` in newer scikit-learn, and positional passing works
# with both spellings.
prec, rec, thresholds = precision_recall_curve(
    df["type"],
    model_pipeline.predict_proba(df["comment"])[:, 1],
)

In [None]:
# Plot the precision-recall curve of the fitted pipeline on the English test set.
# plot_precision_recall_curve is deprecated (and removed in recent scikit-learn);
# PrecisionRecallDisplay.from_estimator is its documented replacement. The legacy
# helper is kept as a fallback for releases that predate from_estimator.
from sklearn.metrics import PrecisionRecallDisplay

if hasattr(PrecisionRecallDisplay, "from_estimator"):
    PrecisionRecallDisplay.from_estimator(model_pipeline, df["comment"], df["type"])
else:
    plot_precision_recall_curve(estimator=model_pipeline, X=df["comment"], y=df["type"])

In [None]:
# Index of the first operating point whose precision exceeds 0.95.
# precision_recall_curve returns one more precision value than thresholds (the
# trailing precision of 1.0 has no threshold), so only prec[:-1] is searched;
# otherwise the index could point past the end of `thresholds`.
high_precision_idx = np.where(prec[:-1] > 0.95)[0]
if high_precision_idx.size == 0:
    raise ValueError("no decision threshold reaches precision > 0.95")
min_threshold = high_precision_idx[0]
min_threshold

In [None]:
# All indices of `prec` whose precision exceeds 0.95 (returned as a 1-tuple of index arrays).
np.where(prec > 0.95)

In [None]:
# Probability cut-off that corresponds to the selected curve index.
thresholds[min_threshold]

In [None]:
# Precision on the English test set at the selected probability cut-off.
# The comparison is inclusive (>=): precision_recall_curve defines the point at
# thresholds[i] over predictions with score >= thresholds[i], so a strict ">"
# would drop the samples scored exactly at the cut-off and report a different
# operating point than the one chosen on the curve.
precision_score(
    y_true=df["type"],
    y_pred=model_pipeline.predict_proba(df["comment"])[:, 1] >= thresholds[min_threshold],
)

In [None]:
# Recall on the English test set at the selected probability cut-off.
# The comparison is inclusive (>=): precision_recall_curve defines the point at
# thresholds[i] over predictions with score >= thresholds[i], so a strict ">"
# would drop the samples scored exactly at the cut-off and under-report recall.
recall_score(
    y_true=df["type"],
    y_pred=model_pipeline.predict_proba(df["comment"])[:, 1] >= thresholds[min_threshold],
)

In [None]:
# Pipeline whose final step is a 3-fold grid search over the logistic-regression
# regularisation strength C. The vectorizer sits outside the search, so TF-IDF
# is fitted once on the full training data and only C is tuned.
grid_pipeline = Pipeline(
    steps=[
        ("vectorizer", TfidfVectorizer(tokenizer=tokenize_sentence)),
        (
            "model",
            GridSearchCV(
                estimator=LogisticRegression(random_state=0),
                param_grid={'C': [0.1, 1, 10.]},
                cv=3,
                verbose=4,  # print per-fold progress while searching
            ),
        ),
    ]
)

In [None]:
# Run the grid search on the English training set (verbose output is printed per fold).
grid_pipeline.fit(X=train_df["comment"], y=train_df["type"])

In [None]:
# Same English pipeline as before, but with C fixed at 10.
# NOTE(review): presumably the value picked from the grid search above — confirm.
model_pipeline_c_10 = Pipeline(
    steps=[
        ("vectorizer", TfidfVectorizer(tokenizer=tokenize_sentence)),
        ("model", LogisticRegression(random_state=0, C=10.)),
    ]
)

In [None]:
# Fit the C=10 pipeline on the English training set.
model_pipeline_c_10.fit(X=train_df["comment"], y=train_df["type"])

In [None]:
# Precision/recall at every candidate threshold for the C=10 pipeline on the test set.
# Column 1 of predict_proba is used as the score — assumes it corresponds to
# label 1 (classes_ == [0, 1]); TODO confirm.
# Arguments are positional on purpose: the score keyword was renamed from
# `probas_pred` to `y_score` in newer scikit-learn, and positional passing works
# with both spellings.
prec_c_10, rec_c_10, thresholds_c_10 = precision_recall_curve(
    df["type"],
    model_pipeline_c_10.predict_proba(df["comment"])[:, 1],
)

In [None]:
# Index of the first operating point of the C=10 model whose precision exceeds 0.95.
# precision_recall_curve returns one more precision value than thresholds (the
# trailing precision of 1.0 has no threshold), so only prec_c_10[:-1] is searched;
# otherwise the index could point past the end of `thresholds_c_10`.
high_precision_idx_c_10 = np.where(prec_c_10[:-1] > 0.95)[0]
if high_precision_idx_c_10.size == 0:
    raise ValueError("no decision threshold reaches precision > 0.95")
min_thresholdC10 = high_precision_idx_c_10[0]

In [None]:
# All indices of `prec_c_10` whose precision exceeds 0.95 (a 1-tuple of index arrays).
np.where(prec_c_10 > 0.95)

In [None]:
# Precision of the C=10 pipeline on the test set at its selected probability cut-off.
# The comparison is inclusive (>=): precision_recall_curve defines the point at
# thresholds[i] over predictions with score >= thresholds[i], so a strict ">"
# would drop the samples scored exactly at the cut-off and report a different
# operating point than the one chosen on the curve.
precision_score(
    y_true=df["type"],
    y_pred=model_pipeline_c_10.predict_proba(df["comment"])[:, 1] >= thresholds_c_10[min_thresholdC10],
)

In [None]:
# Recall of the C=10 pipeline on the test set at its selected probability cut-off.
# The comparison is inclusive (>=): precision_recall_curve defines the point at
# thresholds[i] over predictions with score >= thresholds[i], so a strict ">"
# would drop the samples scored exactly at the cut-off and under-report recall.
recall_score(
    y_true=df["type"],
    y_pred=model_pipeline_c_10.predict_proba(df["comment"])[:, 1] >= thresholds_c_10[min_thresholdC10],
)

In [None]:
# Persist the default English pipeline to disk.
# The pipeline holds a reference to tokenize_sentence, so the `tokenizer` module
# must be importable wherever this file is loaded.
joblib.dump(
    value=model_pipeline,
    filename='modelpipeline.joblib',
)

In [None]:
# Persist the C=10 English pipeline to disk.
# The pipeline holds a reference to tokenize_sentence, so the `tokenizer` module
# must be importable wherever this file is loaded.
joblib.dump(
    value=model_pipeline_c_10,
    filename='modelpipeline_c_10.pkl',
)

In [9]:
# Russian data set, read from the *train* CSV; used below for both fitting and scoring.
df_ru = pd.read_csv(
    filepath_or_buffer='./data/dataset_train_ru.csv',
    sep=',',
)

In [10]:
# Normalise the Russian label column to a fixed-width integer dtype.
# A vectorised cast replaces the per-element Python call `.apply(int)`;
# "int64" is spelled out so the resulting dtype does not depend on the platform.
df_ru['type'] = df_ru['type'].astype("int64")

In [11]:
# Inspect the column dtypes of the Russian frame after the label cast.
df_ru.dtypes

Unnamed: 0     int64
comment       object
type           int64
dtype: object

In [12]:
# (rows, columns) of the Russian frame.
df_ru.shape

(8263, 3)

In [13]:
# Preview the first five rows of the Russian frame.
df_ru.head(n=5)

Unnamed: 0.1,Unnamed: 0,comment,type
0,0,Досудебное расследование по факту покупки ЕНПФ...,0
1,1,Медики рассказали о состоянии пострадавшего му...,0
2,2,"Прошел почти год, как железнодорожным оператор...",0
3,3,По итогам 12 месяцев 2016 года на территории р...,0
4,4,Астана. 21 ноября. Kazakhstan Today - Агентств...,0


In [14]:
# Russian counterpart of the English pipeline: TF-IDF with the Russian project
# tokenizer, followed by logistic regression with default regularisation.
model_pipeline_ru = Pipeline(
    steps=[
        ("vectorizer", TfidfVectorizer(tokenizer=tokenize_sentence_ru)),
        ("model", LogisticRegression(random_state=0)),
    ]
)

In [15]:
# Fit the Russian pipeline on the whole Russian frame.
model_pipeline_ru.fit(X=df_ru['comment'], y=df_ru['type'])

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function tokenize_sentence_ru at 0x000002AA70E2CA60>)),
                ('model', LogisticRegression(random_state=0))])

In [17]:
# Precision of the Russian pipeline.
# NOTE(review): this scores the same rows the pipeline was fitted on, so the
# number is a training-set figure, not a held-out estimate.
precision_score(
    df_ru["type"],
    model_pipeline_ru.predict(df_ru["comment"]),
)

0.9092014592622618

In [18]:
# Recall of the Russian pipeline.
# NOTE(review): this scores the same rows the pipeline was fitted on, so the
# number is a training-set figure, not a held-out estimate.
recall_score(
    df_ru["type"],
    model_pipeline_ru.predict(df_ru["comment"]),
)

0.9853565675794406

In [19]:
# Persist the Russian pipeline to disk.
# The pipeline holds a reference to tokenize_sentence_ru, so the `tokenizer`
# module must be importable wherever this file is loaded.
joblib.dump(
    value=model_pipeline_ru,
    filename='modelpipeline_ru.pkl',
)

['modelpipeline_ru.pkl']