In [None]:
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from pickle import dump

In [None]:
# Estimators for the SUG experiment: an SVC with an RBF kernel and balanced
# class weights, fed by a TF-IDF vectorizer built with default settings.
svc_classifier = SVC(
    kernel="rbf",
    class_weight="balanced",
)
vectorizer = TfidfVectorizer()

In [None]:
# Load the SUG training split; the column names are supplied explicitly.
df = pd.read_csv(
    "../data/App_Training.csv",
    names=["sno", "id", "text", "lbl"],
)

# Labels come straight from the "lbl" column; the TF-IDF vocabulary is fitted
# on the whitespace-trimmed "text" column.
ytrain = df["lbl"].tolist()
Xtrain = vectorizer.fit_transform([doc.strip() for doc in df["text"]])

In [None]:
# Train the SVC on the TF-IDF training matrix and its labels.
svc_classifier.fit(X=Xtrain, y=ytrain)

In [None]:
# Load the labelled SUG test split. Note that this rebinds `df`, so the
# training frame is no longer reachable under that name afterwards.
df = pd.read_csv(
    "../data/App_Test_Labeled.csv",
    names=["sno", "id", "text", "lbl"],
)

# Only transform here: the vectorizer keeps the vocabulary it was fitted with.
ytest = df["lbl"].tolist()
Xtest = vectorizer.transform([doc.strip() for doc in df["text"]])

In [None]:
# Predict on the SUG test split and report the metrics.
ypred = svc_classifier.predict(Xtest)
print(f"test f1_score (SUG): {f1_score(ytest, ypred, zero_division=0):.4f}\n")
# classification_report pairs target_names with the labels in sorted order, so
# the first name describes the smaller label. f1_score above scores label 1 as
# the positive class (its default pos_label), hence 'Negative' must come first.
# NOTE(review): assumes the "lbl" column is encoded as 0/1 - confirm.
print(classification_report(ytest, ypred, target_names=['Negative', 'Positive'], digits=4))

In [None]:
# tsne = TSNE(n_components=2, 
#             learning_rate='auto',
#             n_iter=5000,
#             init='random',
#             random_state=101,
#             n_jobs=-1)

# Xtrain_tsne = tsne.fit_transform(Xtrain)

# df_tsne = pd.DataFrame()
# df_tsne["y"] = [("non-suggestion" if not y else "suggestion") for y in ytrain]
# df_tsne["comp-1"] = Xtrain_tsne[:,0]
# df_tsne["comp-2"] = Xtrain_tsne[:,1]
# markers = {"non-suggestion": "s", "suggestion": "X"}

# fig = sns.scatterplot(x="comp-1", 
#                       y="comp-2", 
#                       hue=df_tsne.y.tolist(),
#                       palette=['green','red'], 
#                       style="y",
#                       markers=markers,
#                       data=df_tsne)

# plt.legend(bbox_to_anchor=(1.02, 0.55), loc='upper left', borderaxespad=0)
# fig.set(title="TF-IDF TSNE visualization (SUG Train)") 

In [None]:
# tsne = TSNE(n_components=2, 
#             learning_rate='auto',
#             n_iter=5000,
#             init='random',
#             random_state=101,
#             n_jobs=-1)

# Xtest_tsne = tsne.fit_transform(Xtest)

# df_tsne = pd.DataFrame()
# df_tsne["y"] = [("non-suggestion" if not y else "suggestion") for y in ytest]
# df_tsne["comp-1"] = Xtest_tsne[:,0]
# df_tsne["comp-2"] = Xtest_tsne[:,1]
# markers = {"non-suggestion": "s", "suggestion": "X"}

# fig = sns.scatterplot(x="comp-1", 
#                       y="comp-2", 
#                       hue=df_tsne.y.tolist(),
#                       palette=['red','green'], 
#                       style="y",
#                       markers=markers,
#                       data=df_tsne)

# plt.legend(bbox_to_anchor=(1.02, 0.55), loc='upper left', borderaxespad=0)
# fig.set(title="TF-IDF TSNE visualization (SUG Test)") 

In [None]:
# Save the fitted TF-IDF vectorizer and SVC for future use.
# pickle.dump returns None, so its result must not be assigned back to the
# object being saved; doing so would leave `vectorizer` / `svc_classifier`
# bound to None and break any cell that is re-run afterwards.
with open("./saved_weights/tfidf_vectorizer_sug.pkl", 'wb') as f:
    dump(vectorizer, f)

with open("./saved_weights/svc_sug.pkl", 'wb') as f:
    dump(svc_classifier, f)

# Use NER Data and perform Suggestion Classification

In [None]:
# Rebind both names to fresh, unfitted estimators for the NER experiment:
# an SVC with an RBF kernel and balanced class weights, plus a default
# TF-IDF vectorizer.
svc_classifier = SVC(
    kernel="rbf",
    class_weight="balanced",
)
vectorizer = TfidfVectorizer()

In [None]:
# Read the space-separated NER training file into columns 'a', 'b', 'c'.
# NOTE(review): pd.eval evaluates the raw text of columns 'a' and 'b' as
# expressions - only point this at trusted data files.
df_ner = pd.read_csv(
    "../data/train_290818.txt",
    sep=' ',
    header=None,
    names=['a', 'b', 'c'],
    encoding="utf-8",
    converters=dict.fromkeys(['a', 'b'], pd.eval),
)

# Binarise column 'c': falsy values become 0, everything else becomes 1.
df_ner['c'] = df_ner['c'].apply(lambda raw: int(bool(raw)))
# Join every entry of column 'a' into a single space-separated string.
# presumably each entry is a list of tokens after pd.eval - TODO confirm
df_ner['a'] = df_ner['a'].apply(' '.join)

# Labels from 'c'; TF-IDF vocabulary fitted on the joined text in 'a'.
ytrain_ner = df_ner['c'].tolist()
Xtrain_ner = vectorizer.fit_transform(df_ner['a'].tolist())

In [None]:
# Train the SVC on the NER TF-IDF training matrix and its binary labels.
svc_classifier.fit(X=Xtrain_ner, y=ytrain_ner)

In [None]:
# Read the space-separated NER test file into columns 'a', 'b', 'c'. This
# rebinds `df_ner`, replacing the training frame held under that name.
# NOTE(review): pd.eval evaluates the raw text of columns 'a' and 'b' as
# expressions - only point this at trusted data files.
df_ner = pd.read_csv(
    "../data/test_290818.txt",
    sep=' ',
    header=None,
    names=['a', 'b', 'c'],
    encoding="utf-8",
    converters=dict.fromkeys(['a', 'b'], pd.eval),
)

# Binarise column 'c': falsy values become 0, everything else becomes 1.
df_ner['c'] = df_ner['c'].apply(lambda raw: int(bool(raw)))
# Join every entry of column 'a' into a single space-separated string.
# presumably each entry is a list of tokens after pd.eval - TODO confirm
df_ner['a'] = df_ner['a'].apply(' '.join)

# Only transform here: the vectorizer keeps the vocabulary it was fitted with.
ytest_ner = df_ner['c'].tolist()
Xtest_ner = vectorizer.transform(df_ner['a'].tolist())

In [None]:
# Predict on the NER test split and report the metrics.
ypred_ner = svc_classifier.predict(Xtest_ner)
print(f"test f1_score (NER): {f1_score(ytest_ner, ypred_ner, zero_division=0):.4f}\n")
# classification_report pairs target_names with the labels in sorted order, so
# the first name describes label 0 and the second label 1. f1_score above
# scores label 1 as the positive class (its default pos_label), hence
# 'Negative' must come first.
print(classification_report(ytest_ner, ypred_ner, target_names=['Negative', 'Positive'], digits=4))

In [None]:
# tsne = TSNE(n_components=2, 
#             learning_rate='auto',
#             n_iter=5000,
#             init='random',
#             random_state=101,
#             n_jobs=-1)

# Xtrain_ner_tsne = tsne.fit_transform(Xtrain_ner)

# df_ner_tsne = pd.DataFrame()
# df_ner_tsne["y"] = [("non-suggestion" if not y else "suggestion") for y in ytrain_ner]
# df_ner_tsne["comp-1"] = Xtrain_ner_tsne[:,0]
# df_ner_tsne["comp-2"] = Xtrain_ner_tsne[:,1]
# markers = {"non-suggestion": "s", "suggestion": "X"}

# fig = sns.scatterplot(x="comp-1", 
#                       y="comp-2", 
#                       hue=df_ner_tsne.y.tolist(),
#                       palette=['red','green'], 
#                       style="y",
#                       markers=markers,
#                       data=df_ner_tsne)

# plt.legend(bbox_to_anchor=(1.02, 0.55), loc='upper left', borderaxespad=0)
# fig.set(title="TF-IDF TSNE visualization (NER Train)") 

In [None]:
# tsne = TSNE(n_components=2, 
#             learning_rate='auto',
#             n_iter=5000,
#             init='random',
#             random_state=101,
#             n_jobs=-1)

# Xtest_ner_tsne = tsne.fit_transform(Xtest_ner)

# df_ner_tsne = pd.DataFrame()
# df_ner_tsne["y"] = [("non-suggestion" if not y else "suggestion") for y in ytest_ner]
# df_ner_tsne["comp-1"] = Xtest_ner_tsne[:,0]
# df_ner_tsne["comp-2"] = Xtest_ner_tsne[:,1]
# markers = {"non-suggestion": "s", "suggestion": "X"}

# fig = sns.scatterplot(x="comp-1", 
#                       y="comp-2", 
#                       hue=df_ner_tsne.y.tolist(),
#                       palette=['red','green'], 
#                       style="y",
#                       markers=markers,
#                       data=df_ner_tsne)

# plt.legend(bbox_to_anchor=(1.02, 0.55), loc='upper left', borderaxespad=0)
# fig.set(title="TF-IDF TSNE visualization (NER Test)") 

In [None]:
# Save the fitted TF-IDF vectorizer and SVC for future use.
# pickle.dump returns None, so its result must not be assigned back to the
# object being saved; doing so would leave `vectorizer` / `svc_classifier`
# bound to None and break any cell that is re-run afterwards.
with open("./saved_weights/tfidf_vectorizer_ner.pkl", 'wb') as f:
    dump(vectorizer, f)

with open("./saved_weights/svc_ner.pkl", 'wb') as f:
    dump(svc_classifier, f)