In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as mt
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

### tf-idf vectorizer generation part 

In [2]:
# I did not use the target directory so need to add it after unifying the path of the files in text pre-processing.ipynb file.

# training data
clean_train = []

for line in open("train.txt", "r"):
    clean_train.append(line.strip().split(','))

for n in range(0,len(clean_train)):
    clean_train[n] = clean_train[n][1:]
    
# testing data
clean_test = []

for line in open("test.txt", "r"):
    clean_test.append(line.strip().split(','))

for n in range(0,len(clean_test)):
    clean_test[n] = clean_test[n][1:]

In [3]:
# training data
train_df = pd.DataFrame(clean_train)
train_df['Words'] = train_df[train_df.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1)
train_df.drop(train_df.iloc[:,1:281], inplace=True, axis=1)
train_df=train_df.rename(columns = {0:'Label'})

In [4]:
train_df

Unnamed: 0,Label,Words
0,cs,"save,special,case,current,training,method,gene..."
1,math.DS,"consider,dynamical,system,finitely,many,equili..."
2,cs,"consider,discrete,dynamical,system,ant,like,ag..."
3,cs,"retrofit,technique,inject,external,resource,wo..."
4,cs,"approach,decision,make,uncertainty,belief,func..."
...,...,...
29633,cs,"powerful,deep,network,architecture,generative,..."
29634,math.AG,"develop,mixed,characteristic,version,mori,muka..."
29635,cs,"complex,analysis,wind,number,measure,number,ti..."
29636,cs,"discus,secure,computation,modular,sum,multiple..."


In [5]:
train_words = train_df.Words.values.tolist()
train_label = train_df.Label.values.tolist()

In [6]:
# testing data
test_df = pd.DataFrame(clean_test)
test_df['Words'] = test_df[test_df.columns[0:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1)
test_df.drop(test_df.iloc[:,0:280], inplace=True, axis=1)
test_df["Label"] = np.nan

In [7]:
test_df

Unnamed: 0,Words,Label
0,"method,model,average,become,important,tool,dea...",
1,"unmanned,aerial,vehicle,uav,system,increasingl...",
2,"paper,propose,new,loss,function,call,generaliz...",
3,"show,integrate,weak,morphism,lie,algebra,cross...",
4,"caustic,occur,widely,dynamic,take,shape,classi...",
...,...,...
7405,"statistical,inference,evolutionary,parameter,m...",
7406,"present,deep,learn,framework,base,generative,a...",
7407,"cell,receptor,tcr,repertoire,data,contain,info...",
7408,"paper,provide,modern,synthesis,classic,inverse...",


In [8]:
test_words = test_df.Words.values.tolist()

- Training & Valid sets

In [9]:
seed = 123
test_size = 0.2
x_train, x_valid, y_train, y_valid = train_test_split(train_words, train_label, test_size=test_size, random_state=seed)

* TF-IDF with n-gram

In [10]:
# https://stackoverflow.com/questions/45883679/train-model-fails-because-list-object-has-no-attribute-lower

def dummy(doc):
    return doc

tfidfv = TfidfVectorizer(tokenizer=dummy,preprocessor=dummy,ngram_range=(1,3))

# training
train_tfidf = tfidfv.fit_transform(x_train)
train_target = np.asarray(y_train)

#valid
valid_tfidf = tfidfv.transform(x_valid)
valid_target = np.asarray(y_valid)

# testing
test_tfidf = tfidfv.transform(test_words)

### Model

### 2. Random Forest

- train a model

In [11]:
#linear SVC model
rf_model = RandomForestClassifier()

In [12]:
rf_model.fit(train_tfidf, train_target)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [13]:
y_prediction = rf_model.predict(valid_tfidf)

- performance metrics of the  model

In [14]:
######## Can these metrics be applied to random forest? 
recall = recall_score(valid_target, y_prediction, average='macro')
precision = precision_score(valid_target, y_prediction, average='macro')
f1score = f1_score(valid_target, y_prediction, average='macro')
accuracy = accuracy_score(valid_target, y_prediction)

print('Confusion Matrix:\n',confusion_matrix(valid_target, y_prediction),'\n')
print('Accuracy:', str(accuracy))
print('Precision:', str(precision))
print('Recall:', str(recall))
print('F1 score:', str(f1score))

Confusion Matrix:
 [[3 0 5 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [3 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 3]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 3 0 0]] 

Accuracy: 0.3441295546558704
Precision: 0.03876114709913876
Recall: 0.034444585666162084
F1 score: 0.034095821481531764


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


- prediction on a test dataset

In [19]:
test_target = rf_model.predict(test_tfidf)

In [20]:
test_df["Label"] = test_target
test_df['test_id'] = list(range(1,len(test_df)+1))

del test_df['Words']
final = test_df[['test_id', 'Label']]

final

Unnamed: 0,test_id,Label
0,1,cs
1,2,cs
2,3,cs
3,4,math.CT
4,5,astro-ph.SR
...,...,...
7405,7406,cs
7406,7407,cs
7407,7408,cs
7408,7409,cs


In [21]:
final.to_csv('./Predictions_rf.csv', index = False)