In [1]:
import json
import string
import pickle
import numpy as np
from joblib import dump
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

### Multinomial Naive Bayes

In [2]:
#getting the data in shape
def getData(file_path, key):
    """Read a JSON-lines file and collect one field from every record.

    Parameters
    ----------
    file_path : str
        Path to a text file holding one JSON document per line.
    key : str
        Field to extract from each decoded document.

    Returns
    -------
    list
        The value stored under ``key`` for every non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a decoded object has no ``key`` entry.
    """
    # JSON text is UTF-8 by specification; stating it explicitly keeps
    # non-ASCII samples from being mis-decoded (or rejected) on platforms
    # whose default encoding is something else.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the handle lazily instead of readlines() so the raw file is
        # not held in memory next to the parsed values. Blank lines (for
        # example a trailing newline at the end of the file) are skipped
        # rather than fed to json.loads, which would raise on them.
        return [json.loads(line)[key] for line in f if line.strip()]


def getFinalModelParams(X, Y, model):
    """Grid-search a count -> tf-idf -> classifier pipeline and return the fitted search.

    Parameters
    ----------
    X : sequence of str
        Raw documents handed to the ``CountVectorizer`` step.
    Y : sequence
        Target label for each document in ``X``.
    model : estimator
        Final pipeline step, registered under the name ``mod``. The grid sets
        ``mod__alpha``, so the estimator has to accept an ``alpha`` parameter.

    Returns
    -------
    GridSearchCV
        The search object after ``fit(X, Y)`` has been called on it.
    """
    steps = [
        ('vectorizer', CountVectorizer()),
        ('tfidf_transformator', TfidfTransformer()),
        ('mod', model),
    ]

    # Only the analyzer is actually varied; every other entry is a
    # single-value list, so the search evaluates two candidates.
    search_space = dict(
        vectorizer__analyzer=['char_wb', 'word'],
        vectorizer__ngram_range=[(2, 5)],
        tfidf_transformator__use_idf=[True],
        mod__alpha=[1e-3],
    )

    searcher = GridSearchCV(Pipeline(steps), search_space, verbose=5)
    searcher.fit(X, Y)
    return searcher

In [2]:
def saveModel(model, filename="final_model.pth"):
    """Pickle ``model`` to disk and print a confirmation.

    Parameters
    ----------
    model : object
        Any picklable object (here: the fitted pipeline).
    filename : str, optional
        Destination path. Defaults to ``"final_model.pth"``, the name the
        original one-argument version always wrote to.
    """
    # The context manager guarantees the handle is flushed and closed even if
    # pickling fails part-way; the bare open() used before never closed it.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)
    print("Saved Model!")

In [4]:
# Load the training texts and their labels (one JSON record per line).
X = np.array(getData('train_X_languages_homework.json.txt', 'text'))
Y = np.array(getData('train_y_languages_homework.json.txt', 'classification'))

#NB for sanity check
model = MultinomialNB()

## Get params of best model
model_gridS = getFinalModelParams(X, Y, model)
# Build the summary once so the printed line and the saved file cannot drift apart.
performance_summary = 'GridSearch found best score as: {} with params {}'.format(model_gridS.best_score_, model_gridS.best_params_)
print(performance_summary)

## Save model
final_model = model_gridS.best_estimator_
saveModel(final_model)

## Save result
# Use a context manager so the file is closed deterministically; the bare
# open(...).write(...) leaked the handle and echoed the character count as
# spurious cell output.
with open('performance.txt', 'w') as performance_file:
    performance_file.write(performance_summary)



Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] mod__alpha=0.001, tfidf_transformator__use_idf=True, vectorizer__analyzer=char_wb, vectorizer__ngram_range=(2, 5) 
[CV]  mod__alpha=0.001, tfidf_transformator__use_idf=True, vectorizer__analyzer=char_wb, vectorizer__ngram_range=(2, 5), score=0.8388531528730959, total= 9.8min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 12.8min remaining:    0.0s


[CV] mod__alpha=0.001, tfidf_transformator__use_idf=True, vectorizer__analyzer=char_wb, vectorizer__ngram_range=(2, 5) 
[CV]  mod__alpha=0.001, tfidf_transformator__use_idf=True, vectorizer__analyzer=char_wb, vectorizer__ngram_range=(2, 5), score=0.8403636798660127, total= 6.4min


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 22.4min remaining:    0.0s


[CV] mod__alpha=0.001, tfidf_transformator__use_idf=True, vectorizer__analyzer=char_wb, vectorizer__ngram_range=(2, 5) 
[CV]  mod__alpha=0.001, tfidf_transformator__use_idf=True, vectorizer__analyzer=char_wb, vectorizer__ngram_range=(2, 5), score=0.8434305957823683, total= 7.0min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 32.7min remaining:    0.0s


[CV] mod__alpha=0.001, tfidf_transformator__use_idf=True, vectorizer__analyzer=word, vectorizer__ngram_range=(2, 5) 
[CV]  mod__alpha=0.001, tfidf_transformator__use_idf=True, vectorizer__analyzer=word, vectorizer__ngram_range=(2, 5), score=0.4356393027092948, total= 6.6min


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 41.0min remaining:    0.0s


[CV] mod__alpha=0.001, tfidf_transformator__use_idf=True, vectorizer__analyzer=word, vectorizer__ngram_range=(2, 5) 
[CV]  mod__alpha=0.001, tfidf_transformator__use_idf=True, vectorizer__analyzer=word, vectorizer__ngram_range=(2, 5), score=0.4419667424333054, total= 7.0min
[CV] mod__alpha=0.001, tfidf_transformator__use_idf=True, vectorizer__analyzer=word, vectorizer__ngram_range=(2, 5) 
[CV]  mod__alpha=0.001, tfidf_transformator__use_idf=True, vectorizer__analyzer=word, vectorizer__ngram_range=(2, 5), score=0.44246834382554995, total= 6.0min


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 58.2min finished


GridSearch found best score as: 0.8408817219736664 with params {'mod__alpha': 0.001, 'tfidf_transformator__use_idf': True, 'vectorizer__analyzer': 'char_wb', 'vectorizer__ngram_range': (2, 5)}
Saved Model!


192

In [4]:
def getData(file_path, key):
    """Read a JSON-lines file and return the ``key`` field of every record.

    This cell re-defines ``getData``; from here on this definition is the one
    in effect.

    Parameters
    ----------
    file_path : str
        Path to a text file holding one JSON document per line.
    key : str
        Field to extract from each decoded document.

    Returns
    -------
    list
        The value stored under ``key`` for every non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a decoded object has no ``key`` entry.
    """
    values = []
    # Explicit UTF-8: JSON is UTF-8 by specification, and relying on the
    # platform default can mis-decode non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Stream the file line by line rather than loading it with readlines().
        for line in f:
            if not line.strip():
                # A blank or trailing empty line is not a record; json.loads
                # would raise on it.
                continue
            values.append(json.loads(line)[key])
    return values

def getFinalModelParams(X, Y, model):
    """Fit a grid search over a count -> tf-idf -> ``model`` text pipeline.

    This definition sets no hyper-parameter on the final estimator itself;
    only vectorizer and tf-idf options appear in the grid.

    Parameters
    ----------
    X : sequence of str
        Raw documents for the ``CountVectorizer`` step.
    Y : sequence
        Target label for each document in ``X``.
    model : estimator
        Final pipeline step, registered under the name ``mod``.

    Returns
    -------
    GridSearchCV
        The search object after ``fit(X, Y)`` has been called on it.
    """
    vectorizer_step = ('vectorizer', CountVectorizer())
    tfidf_step = ('tfidf_transformator', TfidfTransformer())
    model_step = ('mod', model)
    candidate_pipeline = Pipeline([vectorizer_step, tfidf_step, model_step])

    # Two candidates in total: character n-grams within word boundaries
    # versus word n-grams, both over the (2, 5) range with idf weighting.
    grid = {}
    grid['vectorizer__analyzer'] = ['char_wb', 'word']
    grid['vectorizer__ngram_range'] = [(2, 5)]
    grid['tfidf_transformator__use_idf'] = [True]

    return GridSearchCV(candidate_pipeline, grid, verbose=5).fit(X, Y)

### Logistic Regression
The grid search below kept failing with a `MemoryError`, and running it on Colab did not help either.<p>
Moving on to making predictions.

In [5]:
# Load the training texts and their labels (one JSON record per line).
X = np.array(getData('train_X_languages_homework.json.txt', 'text'))
Y = np.array(getData('train_y_languages_homework.json.txt', 'classification'))

#Trying LR
model = LogisticRegression(solver='lbfgs', multi_class='multinomial')

## Get params of best model
model_gridS = getFinalModelParams(X, Y, model)
# Build the summary once so the printed line and the saved file cannot drift apart.
performance_summary = 'GridSearch found best score as: {} with params {}'.format(model_gridS.best_score_, model_gridS.best_params_)
print(performance_summary)

## Save result
# NOTE(review): this writes to the same 'performance.txt' (and, below, the same
# model file via saveModel) as the Naive Bayes cell, so a successful run here
# replaces those results. Confirm that is intended.
# The context manager closes the file deterministically; the bare
# open(...).write(...) leaked the handle.
with open('performance.txt', 'w') as performance_file:
    performance_file.write(performance_summary)

## Save model
final_model = model_gridS.best_estimator_
saveModel(final_model)



Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] tfidf_transformator__use_idf=True, vectorizer__analyzer=char_wb, vectorizer__ngram_range=(2, 5) 




MemoryError: 

### Make Predictions

In [5]:
def outputData(filename, datas):
    """Write each item of ``datas`` to ``filename``, one per line.

    Parameters
    ----------
    filename : str
        Path of the file to create or truncate.
    datas : iterable
        Items to write; each is rendered with ``str.format`` and followed by
        a newline.
    """
    with open(filename, 'w+') as sink:
        # writelines() adds no separators of its own, so the newline is part
        # of every formatted record.
        sink.writelines('{}\n'.format(item) for item in datas)

In [6]:
filename = "final_model.pth"
# Unpickling can execute arbitrary code, so only load a model file that this
# notebook wrote itself. The context manager closes the handle, which the
# bare pickle.load(open(...)) never did.
with open(filename, 'rb') as model_file:
    trained_model = pickle.load(model_file)

# NOTE(review): X is the array of training texts loaded in an earlier cell, so
# these are predictions on the training data. Confirm that is intended.
predictions = trained_model.predict(X)

outputData('predictions.txt', predictions)