In [24]:
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import xgboost as xgb
xgb.set_config(verbosity=1)

from util.Util import plot_accuracies, read_csv
from df_features import DataFrameFeatures
from df_classifier import DataFrameClassifier

In [25]:
# Load the pre-trained GloVe vectors and turn them into a token -> vector lookup.
# The file is space-separated with the token in the first column and no header row.
# NOTE(review): quoting=3 matches csv.QUOTE_NONE, assuming the project read_csv
# forwards its keyword arguments to pandas.read_csv -- confirm.
glove_df = read_csv(
    'util/glove/glove.42B.300d.txt',
    sep=" ",
    index_col=0,
    quoting=3,
    header=None,
)

# Transposing makes every token a column, so .items() yields (token, Series) pairs.
glove_embeddings = {}
for token, vector in tqdm(glove_df.T.items()):
    glove_embeddings[token] = vector.values

# The dataframe is no longer needed once the dict is built; drop the reference.
del glove_df

* Reading CSV from path: util/glove/glove.42B.300d.txt. Size: (1917494, 300)


1917494it [00:22, 83892.67it/s]


In [26]:
# Load the cleaned AG News comparison data and build the small two-class
# working set that the rest of the notebook classifies.
N_ROWS = 250              # number of rows kept for this experiment
MAX_TFIDF_FEATURES = 250  # vocabulary cap for the tf-idf vectorizer

full_df = read_csv('data/comparison_data/agnews_cleaned.csv')

# Keep only ratings 1 and 2. The explicit .copy() detaches the filtered frame
# from the one returned by read_csv, so the column assignments below write to
# a real frame instead of a slice (avoids pandas' SettingWithCopyWarning).
full_df = full_df[full_df['Rating'].isin([1, 2])].copy()
full_df['Rating'] = full_df['Rating'] - 1   # remap labels {1, 2} -> {0, 1}
full_df['NACE'] = 1111.0                    # constant placeholder value; presumably required by DataFrameFeatures -- TODO confirm
full_df['id'] = range(len(full_df))         # sequential ids, assigned before truncating
full_df = full_df.iloc[:N_ROWS].copy()

# Fit a tf-idf vectorizer on the main text column. Only the fitted state is
# needed here, so use fit() rather than fit_transform(), whose returned matrix
# was previously computed and thrown away.
# NOTE(review): the vectorizer is fitted on all rows, i.e. before the
# train/test split performed later, so the idf statistics also see the test
# documents. Consider fitting on the training rows only.
main_col = 'description_no_stopwords_stemmed'
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
vectorizer.fit(full_df[main_col])

full_dff = DataFrameFeatures(df=full_df, vectorizer=vectorizer, main_col=main_col, verbose=False)

* Reading CSV from path: data/comparison_data/agnews_cleaned.csv. Size: (7190, 4)
main col: description_no_stopwords_stemmed


In [27]:
# Show the prepared frame; the notebook renders a truncated preview of its rows.
full_df

Unnamed: 0,Rating,description,description_no_stopwords,description_no_stopwords_stemmed,NACE,id
20,1,michael phelps won the gold medal in the indiv...,michael phelps gold medal individual medley se...,michael phelp gold medal individu medley set w...,1111.0,0
21,1,in quot helping themselves quot ricky bryant c...,quot helping quot ricky bryant chas gessner mi...,quot help quot ricki bryant chas gessner micha...,1111.0,1
22,1,the cleveland indians pulled within one game o...,cleveland indians pulled within one game al ce...,cleveland indian pull within one game al centr...,1111.0,2
23,0,canadian press vancouver cp the sister of a ma...,canadian press vancouver cp sister man died vi...,canadian press vancouv cp sister man die viole...,1111.0,3
24,0,ap the man who claims gov james e mcgreevey se...,ap man claims gov james e mcgreevey sexually h...,ap man claim gov jame e mcgreevey sexual haras...,1111.0,4
...,...,...,...,...,...,...
442,1,the san diego chargers finally reached a contr...,san diego chargers finally reached contract ag...,san diego charger final reach contract agreeme...,1111.0,245
443,1,the nfl s highest scoring offense is averaging...,nfl highest scoring offense averaging two touc...,nfl highest score offens averag two touchdown ...,1111.0,246
445,1,the us softball team completed its scorched ea...,us softball team completed scorched earth run ...,us softbal team complet scorch earth run olymp...,1111.0,247
448,1,athens the booing went on for nearly minutes w...,athens booing went nearly minutes paul hamm ch...,athen boo went near minut paul hamm chalk read...,1111.0,248


In [28]:
# Split row positions rather than the data itself, so the same partition can be
# applied to every feature representation built from full_dff.
document_positions = range(len(full_dff.documents()))
train_indices, test_indices = train_test_split(
    document_positions,
    test_size=0.33,
    random_state=42,
)

### Classify with tf-idf vectors

In [29]:
# Combine the prepared features with the GloVe lookup table; this object
# supplies the X/y arrays requested in the cells below.
dfc = DataFrameClassifier(full_dff, glove_embeddings)

Initializing DataFrameFeatures object
main col: description_no_stopwords_stemmed


In [30]:
# Fetch the tf-idf representation and its labels from the classifier wrapper.
X, y = dfc.get_tfidf_vectors()

# Apply the shared train/test row positions to both the inputs and the labels.
X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]

In [31]:
# Class-balanced logistic regression without an intercept term, trained on the
# current training split and scored (mean accuracy) on the held-out split.
clf = LogisticRegression(
    random_state=0,
    class_weight='balanced',
    fit_intercept=False,
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.891566265060241

### Classify with feature vectors

In [32]:
# Takes a while to run!
# Fetch the feature-vector representation; X and y replace the tf-idf arrays.
X, y = dfc.get_feature_vectors()

# Reuse the same train/test row positions so results stay comparable.
X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]

In [33]:
# Class-balanced logistic regression with the intercept disabled, fitted on the
# feature vectors and scored on the held-out rows.
no_intercept_params = dict(random_state=0, class_weight='balanced', fit_intercept=False)
clf = LogisticRegression(**no_intercept_params)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)


0.5783132530120482

In [34]:
# Inspect the learned coefficient array of the no-intercept model fitted above.
clf.coef_

array([[0.34298049]])

In [35]:
# Refit on the same split, this time leaving fit_intercept at its default so
# the model can learn a bias term, then score on the held-out rows.
clf = LogisticRegression(
    random_state=0,
    class_weight='balanced',
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.6867469879518072

In [36]:
# Inspect the learned coefficient array of the model refitted with the default intercept setting.
clf.coef_

array([[0.36535224]])