In [1]:
#
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import average_precision_score
from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve
from util.rank_metrics import mean_average_precision


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import xgboost as xgb
# Global XGBoost verbosity: 1 = warnings only (0 = silent, 2 = info, 3 = debug)
xgb.set_config(verbosity=1)

from util.Util import plot_accuracies, read_csv
from df_features import DataFrameFeatures
from df_classifier import DataFrameClassifier


In [2]:
# Load the pre-trained GloVe vectors (first column = token, remaining columns = vector).
# quoting=3 is csv.QUOTE_NONE, so quote characters inside tokens are kept literally
# (presumably forwarded to pandas.read_csv by the project's read_csv wrapper - TODO confirm).
glove_df = read_csv(
    'util/glove/glove.42B.300d.txt',
    sep=" ",
    index_col=0,
    quoting=3,
    header=None,
)

# Map every token to its vector as a plain numpy array; transposing lets
# .items() walk the original rows one token at a time.
glove_embeddings = {
    token: vector.values
    for token, vector in tqdm(glove_df.T.items())
}

# The dict now holds everything we need, so drop the frame to free up memory
del glove_df

* Reading CSV from path: util/glove/glove.42B.300d.txt. Size: (1917494, 300)


1917494it [00:22, 86552.07it/s]


In [21]:
# Read the cleaned dataframe
full_df = read_csv('data/comparison_data/agnews_cleaned.csv')

# Number the rows before filtering, so 'id' refers to the position in the
# original file (ids are therefore not contiguous after the filter below).
full_df['id'] = range(len(full_df))

# Keep only the rows rated 1 or 2. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not write into a
# slice of the original (which triggers pandas' SettingWithCopyWarning).
full_df = full_df[full_df['Rating'].isin([1, 2])].copy()

# Shift the two remaining ratings from {1, 2} to {0, 1}
full_df['Rating'] = full_df['Rating'] - 1

# NOTE(review): constant placeholder value - presumably DataFrameFeatures
# expects a 'NACE' column that this dataset does not have; confirm.
full_df['NACE'] = 1111.0

# Create tf-idf vectorizer using the main col and a max of 250 features.
# Only the fitted vocabulary/idf weights are needed here, so call fit()
# rather than fit_transform(), whose returned matrix was being discarded.
main_col = 'description_no_stopwords'
vectorizer = TfidfVectorizer(max_features=250)
vectorizer.fit(full_df[main_col])

full_dff = DataFrameFeatures(df=full_df, vectorizer=vectorizer, main_col=main_col, verbose=False)

* Reading CSV from path: data/comparison_data/agnews_cleaned.csv. Size: (7190, 4)


In [22]:
# Split document positions (not the data itself) into train/test so the same
# split can be reused for every feature representation below.
# 33% is held out; the fixed random_state keeps the split reproducible.
train_indices, test_indices = train_test_split(
    range(len(full_dff.documents())),
    test_size=0.33,
    random_state=42,
)

### Classify with tf-idf vectors

In [None]:
# Wrap the feature frame and the GloVe lookup in the project classifier helper
dfc = DataFrameClassifier(full_dff, glove_embeddings)

# Feature matrix and targets for the tf-idf representation
X, y = dfc.get_tfidf_vectors()

# Apply the precomputed index split to features and targets alike
X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]

In [None]:
# Class-balanced logistic regression without an intercept term,
# trained on the tf-idf training split.
clf = LogisticRegression(
    fit_intercept=False,
    class_weight='balanced',
    random_state=0,
)
clf.fit(X_train, y_train)

# Cell output: score on the held-out split
clf.score(X_test, y_test)

### Classify with feature vectors

In [23]:
# Slow cell - takes a while to run!
# Rebuilds the classifier helper and replaces X, y with the feature-vector
# representation (X and y from any earlier cell are overwritten).
dfc = DataFrameClassifier(full_dff, glove_embeddings)
X, y = dfc.get_feature_vectors()

Initializing DataFrameFeatures object


In [24]:
# Reuse the precomputed index split for the current X and y
X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]

In [25]:
# Class-balanced logistic regression without an intercept term,
# trained on the current training split.
clf = LogisticRegression(
    fit_intercept=False,
    class_weight='balanced',
    random_state=0,
)
clf.fit(X_train, y_train)

# Cell output: score on the held-out split
clf.score(X_test, y_test)


0.5360995850622406

In [26]:
# Inspect the feature matrix: the first two features don't work on the new
# dataset (every displayed row has 0.0 in both columns).
# NOTE(review): the array comes back with dtype=object rather than a float dtype.
X

array([[0.0, 0.0, 0.7623876658539033, 1.0, 0.7623876658539033],
       [0.0, 0.0, 0.7677636778654313, 1.0, 0.7677636778654313],
       [0.0, 0.0, 0.8045612947803527, 1.0, 0.8045612947803527],
       ...,
       [0.0, 0.0, 0.7425873419034302, 1.0, 0.7425873419034302],
       [0.0, 0.0, 0.7156082675592896, 1.0, 0.7156082675592896],
       [0.0, 0.0, 0.8149729914073174, 1.0, 0.8149729914073174]],
      dtype=object)