In [1]:
%reload_ext autoreload
%autoreload 2

import nltk
nltk.download('stopwords')
nltk.download('punkt')


#data_preprocessing
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk import word_tokenize
from string import punctuation
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV
from bs4 import BeautifulSoup 
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import metrics
from sklearn import svm
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_curve

##import dataset
dataset = pd.read_excel('train.xlsx')

##Fill missing values of Host with that link in the same row
for i in list(dataset[dataset['Host'].isnull() == True]['Host'].index):
    dataset.at[i,'Host'] = dataset.loc[i]['Link']
    
##Filling missing value of TRANS_CONV_TEXT with that of title
for i in list(dataset[dataset['TRANS_CONV_TEXT'].isnull() == True].index):
    dataset.at[i, 'TRANS_CONV_TEXT'] = dataset.loc[i]['Title']
##Converting 'Date(ET)' object to datetime
dataset['Date(ET)'] = pd.to_datetime(dataset['Date(ET)'], infer_datetime_format=True)    

##Converting Datetime to timestamp
dataset['Date(ET)'] = dataset[['Date(ET)']].apply(lambda x: x[0].timestamp(), axis = 1).astype(int)

##Dropping time(ET) and time(GMT)
dataset.drop(['Date(ET)', 'Time(ET)', 'time(GMT)'], axis = 1, inplace = True)

##Fill missing values of Title with that of TRANS_CONV_TEXT in the same row
for i in list(dataset[dataset['Title'].isnull() == True]['Title'].index):
    dataset.at[i,'Title'] = dataset.loc[i]['TRANS_CONV_TEXT']

## TEST Data
test = pd.read_csv('test.csv', encoding = 'utf-8')

##Fill missing values of Host with that link in the same row
for i in list(test[test['Host'].isnull() == True]['Host'].index):
    test.at[i,'Host'] = test.loc[i]['Link']
    
#Filling missing value of TRANS_CONV_TEXT with that of title
for i in list(test[test['TRANS_CONV_TEXT'].isnull() == True].index):
    test.at[i, 'TRANS_CONV_TEXT'] = test.loc[i]['Title']

test.at[441,'Date(ET)'] = test.loc[441, 'Time(ET)']

#Converting 'Date(ET)' object to datetime
test['Date(ET)'] = pd.to_datetime(test['Date(ET)'], infer_datetime_format=True)    

#Converting Datetime to timestamp
test['Date(ET)'] = test[['Date(ET)']].apply(lambda x: x[0].timestamp(), axis = 1).astype(int)

#Dropping time(ET) and time(GMT)
test.drop(['Date(ET)','Time(ET)', 'time(GMT)'], axis = 1, inplace = True)

#Fill missing values of Title with that of TRANS_CONV_TEXT in the same row
for i in list(test[test['Title'].isnull() == True]['Title'].index):
    test.at[i,'Title'] = test.loc[i]['TRANS_CONV_TEXT']

index = test['Index']
index = list(index)

test.drop(['Index'], axis = 1, inplace = True)

#feature_selection
y = dataset['Patient_Tag'].tolist()

dataset['Story'] = 'a'


for i in range(len(dataset)):
    dataset.at[i, 'Story'] = dataset['Source'][i] + ' ' + dataset['Host'][i] + ' ' + str(dataset['Link'][i]) + ' ' + dataset['Title'][i] + ' ' + dataset['TRANS_CONV_TEXT'][i]

dataset.drop(['Source', 'Host', 'Link', 'Title', 'TRANS_CONV_TEXT', 'Patient_Tag'], axis = 1, inplace = True)


test['Story'] = 'a'


for i in range(len(test)):
    test.at[i, 'Story'] = test['Source'][i] + ' ' + test['Host'][i] + ' ' + str(test['Link'][i]) + ' ' + test['Title'][i] + ' ' + test['TRANS_CONV_TEXT'][i]

test.drop(['Source', 'Host', 'Link', 'Title', 'TRANS_CONV_TEXT', 'Unnamed: 9'], axis = 1, inplace = True)

replace_space = re.compile('[/(){}\[\]\|@,;]')
bad_symbols = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def preprocess(text):
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = replace_space.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = bad_symbols.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text 

dataset['Story'] = dataset['Story'].apply(preprocess)
test['Story'] = test['Story'].apply(preprocess)

data_train_list = dataset['Story'].tolist()
data_test_list = test['Story'].tolist()

all_data = data_train_list + data_test_list



tt = WordPunctTokenizer()
count_vect=CountVectorizer(tokenizer = tt.tokenize ,stop_words ='english', ngram_range = (1,3))
count_vect.fit(all_data)
train_countvect = count_vect.transform(dataset.Story)
test_countvect = count_vect.transform(test.Story)

## Feature Selection
#Dimensionality reduction using LSI
svd=TruncatedSVD(n_components=50,n_iter=10,random_state=42)
svd.fit(train_countvect)
train_countvect_LSI=svd.transform(train_countvect)

svd=TruncatedSVD(n_components=50,n_iter=10,random_state=42)
svd.fit(test_countvect)
test_countvect_LSI=svd.transform(test_countvect)

#Feature extraction using TFxIDF
tf_idf = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1.0,smooth_idf=1.0,sublinear_tf=1.0,
            stop_words = 'english')
tf_idf.fit(all_data)
train_tf= tf_idf.transform(dataset.Story)
test_tf = tf_idf.transform(test.Story)

## Feature Selection
#Dimensionality reduction using LSI
svd=TruncatedSVD(n_components=50,n_iter=10,random_state=42)
svd.fit(train_tf)
train_tf_LSI=svd.transform(train_tf)

svd=TruncatedSVD(n_components=50,n_iter=10,random_state=42)
svd.fit(test_tf)
test_tf_LSI=svd.transform(test_tf)






[nltk_data] Downloading package stopwords to /home/nbuser/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/nbuser/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### SVC

In [2]:
'''
# defining parameter range 
param_grid = {'C': [0.1, 1, 10, 100, 1000],
             'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}  
cv = KFold(train_tf_LSI.shape[0], shuffle=True, random_state=0)  
grid = GridSearchCV(SVC(), param_grid, cv = cv, n_jobs = -1, refit = True, verbose = 3) 

# fitting the model for grid search 
grid.fit(train_tf_LSI, y) 

# print best parameter after tuning 
print(grid.best_params_) 

# print how our model looks after hyper-parameter tuning 
print(grid.best_estimator_) 
'''

"\n# defining parameter range \nparam_grid = {'C': [0.1, 1, 10, 100, 1000],\n             'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}  \ncv = KFold(train_tf_LSI.shape[0], shuffle=True, random_state=0)  \ngrid = GridSearchCV(SVC(), param_grid, cv = cv, n_jobs = -1, refit = True, verbose = 3) \n\n# fitting the model for grid search \ngrid.fit(train_tf_LSI, y) \n\n# print best parameter after tuning \nprint(grid.best_params_) \n\n# print how our model looks after hyper-parameter tuning \nprint(grid.best_estimator_) \n"

In [3]:
best_params = {'kernel': 'linear', 'C': 1000}
svc_tf = SVC(**best_params, class_weight='balanced', probability=True)
svc_tf.fit(train_tf_LSI, y)

SVC(C=1000, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

### Logistic Regression for CountVect

In [None]:
from sklearn.linear_model import LogisticRegression
# defining parameter range 
param_grid = {'C': [0.1, 1, 5, 10,50, 100],
             'max_iter': [100,200],
             'multi_class': ['auto', 'ovr'],
              'penalty': ['l1', 'l2']
             }  
cv = KFold(train_countvect_LSI.shape[0], shuffle=True, random_state=0)  
grid = GridSearchCV(LogisticRegression(), param_grid, cv = cv, n_jobs = -1, refit = True, verbose = 3) 

# fitting the model for grid search 
grid.fit(train_countvect_LSI, y) 

# print best parameter after tuning 
print(grid.best_params_) 

# print how our model looks after hyper-parameter tuning 
print(grid.best_estimator_) 

Fitting 1157 folds for each of 48 candidates, totalling 55536 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   27.9s
[Parallel(n_jobs=-1)]: Done 984 tasks      | elapsed:   47.3s
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 2264 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 3096 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 4056 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 5144 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 6360 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 7704 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 9176 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 10776 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 12504 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 14227 tasks     

In [None]:
best_params = {'C': 0.9, 'max_iter': 100, 'multi_class': 'ovr', 'penalty': 'l2'}
lr = LogisticRegression(**best_params, class_weight='balanced')
lr.fit(train_countvect_LSI, y)

### Logistic Regression for TFIDF

In [None]:
# defining parameter range 
param_grid = {'C': [1, 5, 10, 100, 250, 500, 1000],
             'max_iter': [50,100,200],
             'multi_class': ['auto', 'ovr', 'multinomial'],
              'penalty': ['l1', 'l2', 'elasticnet']
             }  
cv = KFold(train_tf_LSI.shape[0], shuffle=True, random_state=0)  
grid = GridSearchCV(LogisticRegression(), param_grid, cv = cv, n_jobs = -1, refit = True, verbose = 3) 

# fitting the model for grid search 
grid.fit(train_tf_LSI, y) 

# print best parameter after tuning 
print(grid.best_params_) 

# print how our model looks after hyper-parameter tuning 
print(grid.best_estimator_) 

In [None]:
best_params = {'C': 0.9, 'max_iter': 100, 'multi_class': 'ovr', 'penalty': 'l2'}
lr = LogisticRegression(**best_params, class_weight='balanced')
lr_tf.fit(train_tf_LSI, y)

### Multinomial Naive Bayes


In [None]:
from sklearn.naive_bayes import MultinomialNB
# defining parameter range 
param_grid = {'alpha': [0.005, 0.05, 0.1, 0.5, 5, 50]
             }  
cv = KFold(train_countvect_LSI.shape[0], shuffle=True, random_state=0)  
grid = GridSearchCV(MultinomialNB(), param_grid, cv = cv, n_jobs = -1, refit = True, verbose = 3) 

# fitting the model for grid search 
grid.fit(train_countvect_LSI, y) 

# print best parameter after tuning 
print(grid.best_params_) 

# print how our model looks after hyper-parameter tuning 
print(grid.best_estimator_) 

In [None]:
best_params = {'alpha': 0.5}
nb = MultinomialNB(**best_params)
nb.fit(train_countvect_LSI, y)

### Ensembling

In [None]:
from scipy.stats.mstats import mode

pred1 = svc_tf.predict_proba(test_tf_LSI)
pred2 = lr.predict_proba(test_countvect_LSI)
pred3 = lr_tf.predict_proba(test_tf_LSI)
pred4 = nb.predict_proba(test_countvect_LSI)


test_pred_prob = np.mean([pred1, pred2, pred3, pred4], axis=0)
pred = np.argmax(test_pred_prob, axis=1)


In [None]:
submission = pd.DataFrame()
submission['Index'] = index
submission['SECTION'] = pred.astype(int)
submission.to_csv("ml_prob.csv", index=None)
submission.head()