In [1]:
import pandas as pd
import numpy as np
import sys
from scipy import sparse
import readability as r
import en_core_web_lg
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
Vocab=pd.read_csv('unigram_freq.csv')
from gensim.models import KeyedVectors
from tqdm import tqdm
from nltk.tokenize import word_tokenize 

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
#X_train,X_test,y_train,y_test=train_test_split(train['excerpt'],train['target'])

tfidf=TfidfVectorizer(ngram_range=(1,2))
TfIdf_train=tfidf.fit_transform(train['excerpt'])
TfIdf_test=tfidf.transform(test['excerpt'])
y_train=train['target']

In [3]:
def Readabiity_features(document): #Readability Features 
    Feature_list=[]
    Readability_doc=r.getmeasures(document,lang='en')
    l=[Readability_doc['sentence info'][i] for i in Readability_doc['sentence info'].keys()]+[Readability_doc['readability grades'][i] for i in Readability_doc['readability grades'].keys()]+[Readability_doc['word usage'][i] for i in Readability_doc['word usage'].keys()]+[Readability_doc['sentence beginnings'][i] for i in Readability_doc['sentence beginnings'].keys()]
    return l
readability=pd.DataFrame(*zip([train['excerpt'].apply(lambda x : Readabiity_features(x))])).T
readability_train = readability.excerpt.apply(pd.Series)
readability=pd.DataFrame(*zip([test['excerpt'].apply(lambda x : Readabiity_features(x))])).T
readability_test = readability.excerpt.apply(pd.Series)

In [4]:
def Frequency_Feature(data): #word Frequency feature
    X_VocabfreqCount=[]
    for j in tqdm(data['excerpt']):
        x=[Vocab_dict[word] for word in j.split() if word in Vocab_dict]
        x=np.mean(np.log(x))
        X_VocabfreqCount.append(x)
    return X_VocabfreqCount
Vocab_dict=dict(zip(Vocab.word, Vocab['count']))
FrequencyOfWordsFeature_train=Frequency_Feature(train)
FrequencyOfWordsFeature_test=Frequency_Feature(test)

100%|████████████████████████████████████████████████████████████████████████████| 2834/2834 [00:00<00:00, 7791.09it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 2334.24it/s]


In [5]:
def sentenceToVecFeature(sentence,embeddings_index): #Glove Vector 300d 
    words = str(sentence).lower()
    words = word_tokenize(words)
    words = [w for w in words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

embeddings_index = {}
f = open('glove.6B.300d.txt', encoding="utf8")
for line in tqdm(f):
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except ValueError:
        pass
f.close()

GloveVectorsTrain = [sentenceToVecFeature(x,embeddings_index) for x in tqdm(train['excerpt'])]
GloveVectorsTest = [sentenceToVecFeature(x,embeddings_index) for x in tqdm(test['excerpt'])]

400000it [00:59, 6761.89it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2834/2834 [00:07<00:00, 372.25it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 427.24it/s]


In [6]:
def W2v_Features(data,model):      #Word2Vec Features 300d
    embedding_matrix = np.zeros((len(data), 300))

    for i,words in tqdm(enumerate(data['excerpt'])):
        vector=[]
        for w in words:
            try:
                vector.append(model[w])
            except :
                pass
        embedding_matrix[i]=np.mean(vector , axis=0)
    return embedding_matrix
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
W2v_Train=W2v_Features(train,model)
W2v_test=W2v_Features(test,model)

2834it [00:07, 404.39it/s]
7it [00:00, 947.38it/s]


In [7]:
def SpacyWordVectorEmbedding(data,nlp): #Spacy Vector 300d Citation : https://spacy.io/models
    SpacyVector=[]
    for j,i in tqdm(enumerate(data['excerpt'])):
        SpacyVector.append(nlp(i).vector)
    return SpacyVector
nlp = en_core_web_lg.load()
SpacyTrainEmbedding=SpacyWordVectorEmbedding(train,nlp)
SpacyTestEmbedding=SpacyWordVectorEmbedding(test,nlp)

2834it [02:09, 21.90it/s]
7it [00:00, 18.63it/s]


In [8]:
X_trainFinal=sparse.hstack([sparse.csr_matrix(readability_train.values),W2v_Train,np.array(SpacyTrainEmbedding),GloveVectorsTrain,sparse.csr_matrix(TfIdf_train)]).tocsr()
X_testFinal=sparse.hstack([sparse.csr_matrix(readability_test.values),W2v_test,np.array(SpacyTestEmbedding),GloveVectorsTest,sparse.csr_matrix(TfIdf_test)]).tocsr()
#del readability_train,readability_test,TfIdf_train,TfIdf_test,SpacyTrainEmbedding,SpacyTestEmbedding,GloveVectorsTrain,GloveVectorsTest,W2v_Train,W2v_test

In [9]:
from sklearn.decomposition import TruncatedSVD
SVD=TruncatedSVD(n_components=100)
svd_train=SVD.fit_transform(X_trainFinal) #Dimentionalirty Reduction to 100 Dimentions
svd_test=SVD.transform(X_testFinal)

del SVD

X_trainFinalSVD=sparse.hstack([X_trainFinal,svd_train]) #Adding to Previous Dimentions (seems to improve result when use both orignal data and SVD of Data.)
X_testFinalSVD=sparse.hstack([X_testFinal,svd_test])

In [10]:
from sklearn.linear_model import Lasso,LinearRegression,ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost.sklearn import XGBRegressor
import numpy as np
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import KFold
models=[Lasso(),LinearRegression(),ElasticNet(),RandomForestRegressor(),XGBRegressor()]

In [11]:
kf = KFold(n_splits=5)
kf.get_n_splits(X_trainFinal)
dict1={'Lasso()':[],
'LinearRegression()':[],
'ElasticNet()':[],
'RandomForestRegressor()':[],
'XGBRegressor()':[]
}
dict2={'Lasso()':[],
'LinearRegression()':[],
'ElasticNet()':[],
'RandomForestRegressor()':[],
'XGBRegressor()':[]}
from tqdm import tqdm
for i in tqdm(models):
    Train_loss=[]
    CV_loss=[]
    Train_loss2=[]
    CV_loss2=[]
    for train_index, test_index in kf.split(X_trainFinal):
        i.fit(X_trainFinal[train_index],y_train[train_index])
        Train_loss.append(mean_squared_error(y_train[train_index],i.predict(X_trainFinal[train_index]),squared=False))
        CV_loss.append(mean_squared_error(y_train[test_index],i.predict(X_trainFinal[test_index]),squared=False))
        Train_loss2.append(r2_score(y_train[train_index],i.predict(X_trainFinal[train_index])))
        CV_loss2.append(r2_score(y_train[test_index],i.predict(X_trainFinal[test_index])))
    dict1[str(i)].append(min(Train_loss))
    dict1[str(i)].append(min(CV_loss))
    dict2[str(i)].append(min(Train_loss2))
    dict2[str(i)].append(min(CV_loss2))

 80%|████████████████████████████████████████████████████████████████                | 4/5 [2:53:11<43:17, 2597.78s/it]


KeyError: "XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n             importance_type='gain', interaction_constraints='',\n             learning_rate=0.300000012, max_delta_step=0, max_depth=6,\n             min_child_weight=1, missing=nan, monotone_constraints='()',\n             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,\n             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,\n             tree_method='exact', validate_parameters=1, verbosity=None)"

In [12]:
dict1 #After LinearSVM C=100 RMSE

{'Lasso()': [0.8128338066865102, 0.7465851915826207],
 'LinearRegression()': [0.00043359472704933326, 0.6781987710124979],
 'ElasticNet()': [0.7861049171250712, 0.7372812381182191],
 'RandomForestRegressor()': [0.2572172801293825, 0.6764789115682801],
 'XGBRegressor()': []}

In [13]:
dict2 #After LinearSVM C=100 R2Score

{'Lasso()': [0.24570855824282145, -0.50534224714658],
 'LinearRegression()': [0.9999956886361745, -4.153706042998469],
 'ElasticNet()': [0.29993865650714735, -0.33382211753369573],
 'RandomForestRegressor()': [0.9304961475668622, 0.21223077821842518],
 'XGBRegressor()': []}

Here we observed that best baseline model with only Tfidf features only gave thre <b>RMSE of 0.7044741966281817</b> whereas <br>
With adding of all these features we get much lower <b>RMSE of 0.5956282170790118</b>

<b>Individual Performance of Features:</b><br>



<b>Format : [Train , CrossValidation] </b> <br>


<b>TFIDF</b> : 0.7044743964935553 : {RMSE : [0.3867706170086269, 0.7190667613500883] , R2Score : [0.8469583014383384, 0.17079930921263164]}<br>

<b>TFIDF + Readabiity_features : </b>{RMSE : [0.20266040825359827, 0.6712319048527144] , R2Score : [0.9560945535762808, 0.2241373509311262]}<br>

<b>TFIDF + Readability_features + Word2Vec : </b>{RMSE :
[0.20148038298156054, 0.6672371344118301] ,R2Score : [0.9565913179844012, 0.2311981965100628]}<br>

<b>TFIDF + Readability_features + Word2Vec + GloveEmbeddings : </b> {RMSE :
[0.19078572759976645, 0.6471675450749329] ,R2Score : [0.9616287548685135, 0.3396660601486968]}<br>

<b>TFIDF + Readability_features + Word2Vec + GloveEmbeddings +Word_Frequency:</b> {RMSE :
[0.19075587287527357, 0.6494942113559232] ,R2Score : [0.9616185373924281, 0.3385741755044317]}<br>


<b>TFIDF + Readability_features + Word2Vec + GloveEmbeddings +Word_Frequency + SpacywordEmbeddings:</b> {RMSE :
[0.1689978282129475, 0.5955219015511223] ,R2Score :  [0.97033740820051, 0.50624746304775]}<br>


<b>TFIDF + Readability_features + Word2Vec + GloveEmbeddings  + SpacywordEmbeddings : </b>{RMSE : [0.1689978282129475, 0.5955219015511223] , R2Score : [0.97033740820051, 0.50624746304775]}<br>


In [11]:
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth': [3, 10, 15],'min_child_weight': np.arange(1, 10, 2),'gamma': np.arange(0, 10, 2),'n_estimators': [50, 100, 150],'learning_rate': [0.01,0.1],'subsample': np.arange(0.5, 1.0, 0.25)}
search = GridSearchCV(XGBRegressor(n_jobs = -1), param_grid, cv=5,verbose=4)

In [12]:
search.fit(X_trainFinal, y_train)

KeyboardInterrupt: 