In [None]:
import os
import re
import sklearn
import numpy as np
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor, BaggingClassifier, AdaBoostClassifier, VotingClassifier, VotingRegressor
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn import svm
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import sparse
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.naive_bayes import BernoulliNB
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import Doc2Vec

In [None]:
# Install into the kernel's own environment (%pip) and pin the version this
# notebook was last run with, so a rerun resolves the same package.
%pip install -q pymagnitude==0.1.143

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymagnitude
  Downloading pymagnitude-0.1.143.tar.gz (5.4 MB)
[K     |████████████████████████████████| 5.4 MB 12.4 MB/s 
[?25hBuilding wheels for collected packages: pymagnitude
  Building wheel for pymagnitude (setup.py) ... [?25l[?25hdone
  Created wheel for pymagnitude: filename=pymagnitude-0.1.143-cp37-cp37m-linux_x86_64.whl size=360429840 sha256=4065aabcd77258a0ce399195a561eaf1f62e3e93ad5abd490b9e9af4a062361a
  Stored in directory: /root/.cache/pip/wheels/0e/96/d6/b765a1ce34517c193d764b634b1ff7db5e1dcfea2520f17273
Successfully built pymagnitude
Installing collected packages: pymagnitude
Successfully installed pymagnitude-0.1.143


In [None]:
# Download the 100-d GloVe vectors only if they are not already on disk.
# -f makes curl fail on an HTTP error instead of saving the error page as the
# vector file; the .part + mv step keeps an interrupted transfer from being
# mistaken for a complete file on the next run.
!test -f glovevectors.magnitude || (curl -fsSL http://magnitude.plasticity.ai/glove+subword/glove.6B.100d.magnitude --output glovevectors.magnitude.part && mv glovevectors.magnitude.part glovevectors.magnitude)

In [None]:

#!curl -s http://magnitude.plasticity.ai/glove+subword/glove.6B.50d.magnitude --output vectors.magnitude
# Each download is skipped when the file already exists. -f makes curl fail on
# an HTTP error instead of saving the error page as the vector file, and the
# .part + mv step keeps an interrupted transfer from looking like a finished file.
!test -f word2vec.magnitude || (curl -fsSL http://magnitude.plasticity.ai/word2vec+subword/GoogleNews-vectors-negative300.magnitude --output word2vec.magnitude.part && mv word2vec.magnitude.part word2vec.magnitude)
!test -f fasttext.magnitude || (curl -fsSL http://magnitude.plasticity.ai/fasttext+subword/wiki-news-300d-1M.magnitude --output fasttext.magnitude.part && mv fasttext.magnitude.part fasttext.magnitude)

!test -f glovevectors.magnitude || (curl -fsSL http://magnitude.plasticity.ai/glove+subword/glove.6B.100d.magnitude --output glovevectors.magnitude.part && mv glovevectors.magnitude.part glovevectors.magnitude)

In [None]:
# Google Drive has to be mounted before the CSV below can be read.
if not os.path.isdir('/content/drive/MyDrive'):
    from google.colab import drive
    drive.mount('/content/drive')

# Shuffle with a fixed seed: the integer-cv splitters used downstream cut folds
# from row order, so an unseeded shuffle changes every score between runs.
df = pd.read_csv('/content/drive/MyDrive/train108.csv').sample(frac=1, random_state=42)
docs = df['doc_text']
y = df['labels']
X = docs

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab runtime).
# NOTE(review): the data-loading cell that reads from /content/drive/MyDrive
# appears earlier in the notebook than this cell — confirm the execution order.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def get_results(X, y, cv=10, random_state=None):
    """Cross-validate five classifiers and report mean accuracy and macro-F1.

    The models are evaluated in a fixed order: MLP, Bernoulli naive Bayes,
    logistic regression, a degree-2 polynomial SVM and a random forest.
    For each one the fold-averaged scores are printed and stored.

    Parameters
    ----------
    X : array-like
        Feature matrix passed straight to ``cross_validate``.
    y : array-like
        Class labels.
    cv : int or cross-validation splitter, default 10
        Forwarded to ``cross_validate``.
    random_state : int or None, default None
        Seed forwarded to every estimator that accepts one. ``None`` keeps
        the estimators' default (unseeded) behaviour.

    Returns
    -------
    dict
        ``{'<model>_acc': float, '<model>_f1': float}`` for the prefixes
        ``mlp``, ``nb``, ``lr``, ``svm`` and ``rf``.
    """
    scoring = {
        'acc': 'accuracy',
        'f1': 'f1_macro'
    }
    # (result-key prefix, printed heading, estimator), in reporting order.
    models = [
        ('mlp', "MLP",
         MLPClassifier(n_iter_no_change=50, max_iter=10000,
                       hidden_layer_sizes=(512, ), random_state=random_state)),
        ('nb', "Naive Bayes", BernoulliNB()),
        ('lr', "Logistic Regression",
         LogisticRegression(max_iter=1000, random_state=random_state)),
        ('svm', "SVM",
         svm.SVC(kernel='poly', degree=2, random_state=random_state)),
        ('rf', "Random Forest",
         RandomForestClassifier(n_estimators=1000, n_jobs=-1,
                                random_state=random_state)),
    ]

    values = {}
    for key, heading, clf in models:
        # Only the test scores are reported, so train scores are not requested.
        res = cross_validate(clf, X, y, cv=cv, scoring=scoring)
        mean_acc = res['test_acc'].mean()
        mean_f1 = res['test_f1'].mean()

        values[key + '_acc'] = mean_acc
        values[key + '_f1'] = mean_f1

        print()
        print(heading)
        print("Accuracy: %f" % mean_acc)
        print("F1 Score: %f" % mean_f1)

    print()
    print()
    return values
    
    

In [None]:
def get_reg_results(X, y, cv=10):
    """Cross-validate four regressors and print each one's mean RMSE.

    The models are linear regression on scaled features, a random forest,
    an SVR and an MLP, evaluated in that order.

    Parameters
    ----------
    X : array-like
        Feature matrix passed straight to ``cross_val_score``.
    y : array-like
        Numeric regression target.
    cv : int or cross-validation splitter, default 10
        Forwarded to ``cross_val_score``.

    Returns
    -------
    None
        Results are printed only.
    """
    # Function-scope import: make_pipeline is not among the notebook's
    # top-level imports.
    from sklearn.pipeline import make_pipeline

    models = [
        # LinearRegression(normalize=True) is deprecated; this pipeline is the
        # replacement scikit-learn's deprecation warning recommends.
        ("linear regression RMSE:",
         make_pipeline(StandardScaler(with_mean=False), LinearRegression())),
        ("Random Forest regressor RMSE:", RandomForestRegressor()),
        ("SVM RMSE:", svm.SVR()),
        ("MLP RMSE:", MLPRegressor(max_iter=10000, learning_rate='constant')),
    ]

    for label, regr in models:
        scores = cross_val_score(regr, X, y,
                                 scoring='neg_root_mean_squared_error', cv=cv)
        # The scorer returns negated RMSE; flip the sign for reporting.
        print(label, -scores.mean())
        print()

In [None]:
#word2vec: inspect the embedding of a single transcript
from tqdm.notebook import tqdm  # tqdm.tqdm_notebook is deprecated
from nltk import word_tokenize
from pymagnitude import Magnitude

word2vec = Magnitude("/content/word2vec.magnitude")

def avg_w2v(x, verbose=False):
    """Embed each text in ``x`` as the average of its word2vec token vectors.

    Parameters
    ----------
    x : iterable of str
        Texts to embed; each is tokenised with ``word_tokenize``.
    verbose : bool, default False
        When true, print each text, its per-token vectors and their shape.

    Returns
    -------
    numpy.ndarray
        One averaged vector per input text, stacked with ``np.array``.
    """
    vectors = []
    for title in tqdm(x):
        # Query once and reuse the result for both the debug output and the mean.
        token_vectors = word2vec.query(word_tokenize(title))
        if verbose:
            print(title)
            print("\n")
            print(token_vectors)
            print(token_vectors.shape)
        vectors.append(np.average(token_vectors, axis = 0))
    return np.array(vectors)

X_1=['okay.mother is drying the dishes but the water is going out over the sink onto the floor.ah it s a pretty day outside.lots of flowers.there are three dishes left to wash dry I guess.she s standing in the water.looks dangerous.the children i are getting into the cookie jar.the boy s up there on the stool.and that s almost falling over.he s got one cookie in his hand.he s handing it to the little girl.and he s getting another one out_of the cookie jar cupboard..uh the little girl is kinda kind_of has her hand up to her mouth.looks like she s trying to eat it or be quiet or she s laughing laughs.the mother doesn t seem to be paying much attention.she looks like she s looking out the window.the water is splashing onto the floor.that s it']
x = avg_w2v(X_1, verbose=True)
#get_results(x,y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


  0%|          | 0/1 [00:00<?, ?it/s]

okay.mother is drying the dishes but the water is going out over the sink onto the floor.ah it s a pretty day outside.lots of flowers.there are three dishes left to wash dry I guess.she s standing in the water.looks dangerous.the children i are getting into the cookie jar.the boy s up there on the stool.and that s almost falling over.he s got one cookie in his hand.he s handing it to the little girl.and he s getting another one out_of the cookie jar cupboard..uh the little girl is kinda kind_of has her hand up to her mouth.looks like she s trying to eat it or be quiet or she s laughing laughs.the mother doesn t seem to be paying much attention.she looks like she s looking out the window.the water is splashing onto the floor.that s it


[[ 0.04464937 -0.08627193  0.02126178 ...  0.00840676  0.01131843
  -0.01657936]
 [ 0.003746   -0.0389198   0.0913317  ...  0.0059677   0.0871803
   0.0568229 ]
 [-0.0186125   0.1136832  -0.0525167  ... -0.0069121   0.0096152
   0.0108895 ]
 ...
 [ 0.009

In [None]:
# Print the averaged embedding computed for the single demo transcript.
print(x)

[[ 1.35899512e-02  2.58842893e-02  1.88410822e-02  4.71491814e-02
  -2.73571406e-02 -1.30903283e-02  1.87956765e-02 -4.16561402e-02
   1.57409497e-02  3.96436602e-02 -2.31844578e-02 -5.29324897e-02
  -1.04440767e-02  6.62566908e-03 -4.05625552e-02  1.44594042e-02
   1.25243189e-02  3.98486853e-02 -2.85680196e-03 -7.91090447e-03
  -1.58000551e-02  4.69923057e-02  1.33664133e-02 -1.23339938e-02
   1.37921236e-02 -1.63893569e-02 -3.85858454e-02  2.91990396e-02
   1.07616317e-02  1.08298454e-02 -9.31420736e-03  1.34506561e-02
  -2.58576740e-02 -8.09577480e-03 -2.06301454e-03  3.23132286e-03
  -9.99414269e-03  4.59116371e-03  1.96400788e-02  2.80177314e-02
   1.12036821e-02 -4.08315919e-02  4.73221652e-02 -1.21257333e-02
  -7.24186539e-04 -2.77760671e-03 -2.68366244e-02  1.62926111e-02
   3.39698307e-02  1.42309358e-02 -5.23304753e-03  3.43776643e-02
   2.85066641e-03 -5.43692010e-03 -5.83212008e-04  3.35445181e-02
  -6.45789551e-03 -1.58701446e-02  2.30063312e-02 -1.87392235e-02
  -1.87485

In [None]:
# Shape check: one row per embedded text.
print(x.shape)

(1, 300)


In [None]:
# NOTE(review): `l`, `TDX` and `TDY` are not defined in any cell visible in
# this notebook, so this cell presumably fails on a fresh kernel — confirm
# where they come from or remove the cell.
accdelta=l.score(TDX,TDY)
print("Accuracy using only delta features: ", accdelta)

In [None]:
#word2vec
from tqdm.notebook import tqdm  # tqdm.tqdm_notebook is deprecated
from nltk import word_tokenize
from pymagnitude import Magnitude

word2vec = Magnitude("/content/word2vec.magnitude")

def avg_w2v(x):
    """Embed each text in ``x`` as the average of its word2vec token vectors.

    Each text is tokenised with ``word_tokenize``, the tokens are looked up in
    the ``word2vec`` model, and the per-token vectors are averaged over axis 0.
    Returns the per-text vectors stacked with ``np.array``.
    """
    return np.array([
        np.average(word2vec.query(word_tokenize(title)), axis = 0)
        for title in tqdm(x)
    ])

x = avg_w2v(X)
get_results(x,y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


  0%|          | 0/108 [00:00<?, ?it/s]


MLP
Accuracy: 0.813636
F1 Score: 0.809390

Naive Bayes
Accuracy: 0.786364
F1 Score: 0.781425

Logistic Regression
Accuracy: 0.805455
F1 Score: 0.802110

SVM
Accuracy: 0.822727
F1 Score: 0.819262

Random Forest
Accuracy: 0.824545
F1 Score: 0.819823




{'lr_acc': 0.8054545454545454,
 'lr_f1': 0.8021095571095571,
 'mlp_acc': 0.8136363636363635,
 'mlp_f1': 0.8093900543900544,
 'nb_acc': 0.7863636363636364,
 'nb_f1': 0.781424963924964,
 'rf_acc': 0.8245454545454545,
 'rf_f1': 0.8198229548229549,
 'svm_acc': 0.8227272727272726,
 'svm_f1': 0.8192618492618491}

In [None]:
# Download the NLTK 'punkt' and 'stopwords' resources.
# NOTE(review): word_tokenize is already called in earlier cells; presumably it
# needs 'punkt', so this download may belong before them — confirm.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
    

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Re-score whatever feature matrix `x` currently holds.
# NOTE(review): which embedding `x` contains depends on the cell that ran
# last — confirm the intended features before comparing these numbers.
get_results(x,y)


MLP
Accuracy: 0.860909
F1 Score: 0.852435

Naive Bayes
Accuracy: 0.798182
F1 Score: 0.790440

Logistic Regression
Accuracy: 0.780000
F1 Score: 0.776066

SVM
Accuracy: 0.834545
F1 Score: 0.828781

Random Forest
Accuracy: 0.779091
F1 Score: 0.776300




{'lr_acc': 0.78,
 'lr_f1': 0.7760664335664336,
 'mlp_acc': 0.860909090909091,
 'mlp_f1': 0.85243450993451,
 'nb_acc': 0.7981818181818182,
 'nb_f1': 0.7904395604395604,
 'rf_acc': 0.7790909090909091,
 'rf_f1': 0.7762995337995338,
 'svm_acc': 0.8345454545454544,
 'svm_f1': 0.8287814962814963}

In [None]:
# Download the NLTK 'punkt' and 'stopwords' resources.
# NOTE(review): the same downloads appear in an earlier cell; one of the two
# cells looks redundant — confirm and keep a single copy.
import nltk
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Download the 100-d GloVe vectors only if they are not already on disk.
# -f makes curl fail on an HTTP error instead of saving the error page as the
# vector file; the .part + mv step keeps an interrupted transfer from being
# mistaken for a complete file on the next run.
!test -f glovevectors.magnitude || (curl -fsSL http://magnitude.plasticity.ai/glove+subword/glove.6B.100d.magnitude --output glovevectors.magnitude.part && mv glovevectors.magnitude.part glovevectors.magnitude)

In [None]:
# Download the 50-d GloVe, word2vec and fastText vectors, skipping any file
# that already exists. -f makes curl fail on an HTTP error instead of saving
# the error page as the vector file, and the .part + mv step keeps an
# interrupted transfer from looking like a finished file.
!test -f vectors.magnitude || (curl -fsSL http://magnitude.plasticity.ai/glove+subword/glove.6B.50d.magnitude --output vectors.magnitude.part && mv vectors.magnitude.part vectors.magnitude)
!test -f word2vec.magnitude || (curl -fsSL http://magnitude.plasticity.ai/word2vec+subword/GoogleNews-vectors-negative300.magnitude --output word2vec.magnitude.part && mv word2vec.magnitude.part word2vec.magnitude)
!test -f fasttext.magnitude || (curl -fsSL http://magnitude.plasticity.ai/fasttext+subword/wiki-news-300d-1M.magnitude --output fasttext.magnitude.part && mv fasttext.magnitude.part fasttext.magnitude)

In [None]:
#glove
from tqdm.notebook import tqdm  # tqdm.tqdm_notebook is deprecated
from nltk import word_tokenize
from pymagnitude import Magnitude

glove = Magnitude("/content/glovevectors.magnitude")

def avg_glove(x):
    """Embed each text in ``x`` as the average of its GloVe token vectors.

    Each text is tokenised with ``word_tokenize``, the tokens are looked up in
    the ``glove`` model, and the per-token vectors are averaged over axis 0.
    Returns the per-text vectors stacked with ``np.array``.
    """
    return np.array([
        np.average(glove.query(word_tokenize(title)), axis = 0)
        for title in tqdm(x)
    ])

x = avg_glove(X)
get_results(x,y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


  0%|          | 0/108 [00:00<?, ?it/s]


MLP
Accuracy: 0.809091
F1 Score: 0.803002

Naive Bayes
Accuracy: 0.668182
F1 Score: 0.658215

Logistic Regression
Accuracy: 0.724545
F1 Score: 0.717134

SVM
Accuracy: 0.751818
F1 Score: 0.744292

Random Forest
Accuracy: 0.804545
F1 Score: 0.799623




{'lr_acc': 0.7245454545454545,
 'lr_f1': 0.7171342546342546,
 'mlp_acc': 0.809090909090909,
 'mlp_f1': 0.8030022755022754,
 'nb_acc': 0.6681818181818182,
 'nb_f1': 0.6582153957153958,
 'rf_acc': 0.8045454545454545,
 'rf_f1': 0.7996225996225996,
 'svm_acc': 0.7518181818181818,
 'svm_f1': 0.7442923742923743}

In [None]:
#fasttext
from tqdm.notebook import tqdm  # tqdm.tqdm_notebook is deprecated
from nltk import word_tokenize
from pymagnitude import Magnitude

ft = Magnitude("/content/fasttext.magnitude")

def avg_ft(x):
    """Embed each text in ``x`` as the average of its fastText token vectors.

    Each text is tokenised with ``word_tokenize``, the tokens are looked up in
    the ``ft`` model, and the per-token vectors are averaged over axis 0.
    Returns the per-text vectors stacked with ``np.array``.
    """
    return np.array([
        np.average(ft.query(word_tokenize(title)), axis = 0)
        for title in tqdm(x)
    ])

x = avg_ft(X)
get_results(x,y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


  0%|          | 0/108 [00:00<?, ?it/s]


MLP
Accuracy: 0.880000
F1 Score: 0.878252

Naive Bayes
Accuracy: 0.834545
F1 Score: 0.828338

Logistic Regression
Accuracy: 0.780909
F1 Score: 0.776305

SVM
Accuracy: 0.780909
F1 Score: 0.774164

Random Forest
Accuracy: 0.853636
F1 Score: 0.850905




{'lr_acc': 0.780909090909091,
 'lr_f1': 0.7763053613053613,
 'mlp_acc': 0.8800000000000001,
 'mlp_f1': 0.8782517482517482,
 'nb_acc': 0.8345454545454546,
 'nb_f1': 0.8283383283383283,
 'rf_acc': 0.8536363636363637,
 'rf_f1': 0.850905205905206,
 'svm_acc': 0.780909090909091,
 'svm_f1': 0.7741641691641692}

In [None]:
# Shuffle with a fixed seed: the integer-cv splitters used downstream cut folds
# from row order, so an unseeded shuffle changes every score between runs.
df = pd.read_csv('/content/drive/MyDrive/train108mmse.csv').sample(frac=1, random_state=42)
docs = df['doc_text']
# The column loads as object dtype; convert it explicitly so a non-numeric
# entry fails here with a clear error rather than inside a regressor.
y = pd.to_numeric(df['mmse'])
X = docs

In [None]:
# Peek at the first few values of the regression target.
y.head()

70      14
8       30
96      19
56      19
104     24
Name: mmse, dtype: object

In [None]:
# Regression on the `y` target, once per pretrained embedding (word2vec, GloVe, fastText).
from tqdm.notebook import tqdm  # tqdm.tqdm_notebook is deprecated
from nltk import word_tokenize
from pymagnitude import Magnitude


def avg_embedding(model, texts):
    """Embed each text as the average of its token vectors from ``model``.

    Parameters
    ----------
    model : Magnitude
        Embedding model; its ``query`` method is called with the token list.
    texts : iterable of str
        Texts to embed; each is tokenised with ``word_tokenize``.

    Returns
    -------
    numpy.ndarray
        One averaged vector per text, stacked with ``np.array``.
    """
    return np.array([
        np.average(model.query(word_tokenize(title)), axis = 0)
        for title in tqdm(texts)
    ])


# (label, vector file) pairs, evaluated in this order.
EMBEDDING_FILES = [
    ("word2vec", "/content/word2vec.magnitude"),
    ("glove", "/content/glovevectors.magnitude"),
    ("fasttext", "/content/fasttext.magnitude"),
]

for emb_name, emb_path in EMBEDDING_FILES:
    # Label each result block so the printed RMSEs can be told apart.
    print(emb_name)
    x = avg_embedding(Magnitude(emb_path), X)
    get_reg_results(x,y)



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


  0%|          | 0/108 [00:00<?, ?it/s]

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

linear regression RMSE: 5.546066056799193

Random Forest regressor RMSE: 5.118151496731055

SVM RMSE: 6.959950752833249

MLP RMSE: 4.891201178549929



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/108 [00:00<?, ?it/s]

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

linear regression RMSE: 5.254127861340853

Random Forest regressor RMSE: 5.219181051506029

SVM RMSE: 6.934707339103926

MLP RMSE: 5.015089151665861



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/108 [00:00<?, ?it/s]

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

linear regression RMSE: 15.70318986524787

Random Forest regressor RMSE: 4.808194735079504

SVM RMSE: 7.352194013703311





MLP RMSE: 5.688737487504349



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/108 [00:00<?, ?it/s]

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

linear regression RMSE: 15.891636124728583

Random Forest regressor RMSE: 5.572662007082552

SVM RMSE: 7.321560309688108

MLP RMSE: 5.601360772651235



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/108 [00:00<?, ?it/s]

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

linear regression RMSE: 5.487826213106175

Random Forest regressor RMSE: 4.758020569166023

SVM RMSE: 7.466436486741007

MLP RMSE: 5.003190622591127



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/108 [00:00<?, ?it/s]

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

linear regression RMSE: 5.533346023607286

Random Forest regressor RMSE: 4.993261352562546

SVM RMSE: 7.426627481272935

MLP RMSE: 5.200097579914923



'\n#elmo\nfrom tqdm import tqdm_notebook\nfrom nltk import word_tokenize\nfrom pymagnitude import *\n\n\nglove = Magnitude("../downloads/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude")\ndef avg_glove(x):\n    vectors = []\n    for title in tqdm_notebook(x):\n        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))\n    return np.array(vectors)\n\nx = avg_glove(X)\n# x_test = avg_glove(test)\n\nget_reg_results(x,y)\n\n#elmo\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nglove = Magnitude("../downloads/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude")\n\nx = docs\n\ntfidf = TfidfVectorizer()\ntfidf.fit(x)\n# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value\nidf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))\n# Same as Avg Glove except instead of doing a regular average, we\'ll use the IDF values as weights.\ndef tfidf_glove(df):\n    vectors = []\n    for title in tqdm_notebook(df):\n   

In [None]:
# Display the transcript column currently bound to X.
X

15     okay.well the mother is drying the dishes.the ...
31     well let s see.the boy is taking cookies out_o...
46     okay.the boy is taking cookies out_of a closet...
14     all of the action.the mother is dryin g a plat...
95     well this boy is almost falling out of the off...
                             ...                        
72     the boy s uh fallin g off the stool.the the gi...
99     well the kid s standin g on a tilted stool get...
98     well little boy clears throat reachin g out fo...
101    well she s washin g dishes.he s climbin g up t...
43     well there s a kid stealin g cookies from the ...
Name: doc_text, Length: 108, dtype: object

In [None]:
# Training results of various classifiers using the 'word2vec' embedding (10-fold cross-validation). TODO: do the same for DistilBERT.