# Task 1: Predictions
***

Now that we have a model that performs fairly well on all codes, we start filling in our dataset.

In [1]:
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.util import ngrams

from sklearn.externals import joblib

from sklearn.linear_model import SGDClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

> First, we need to define the custom feature class that we created earlier.

In [2]:
class Feature_Kw(BaseEstimator, TransformerMixin):
    """Binary keyword-presence features for a collection of texts.

    For every text, column ``j`` of the output is 1.0 when keyword ``kw[j]``
    appears in the text as a space-joined n-gram of 1 to ``max_len`` tokens,
    and 0.0 otherwise.

    Parameters
    ----------
    kw : sequence of str
        Keywords / key phrases to look for. Each is compared against the
        tokens of an n-gram joined with single spaces.
    max_len : int
        Largest n-gram length (in tokens) that is generated for matching.
        Values below 1 generate no n-grams, so every feature is 0.
    """

    def __init__(self, kw, max_len):
        self.kw = kw
        self.max_len = max_len

    def fit(self, examples, y=None):
        """Stateless transformer: nothing to learn, returns ``self``."""
        return self

    def transform(self, examples, y=None):
        """Return an array of shape ``(len(examples), len(kw))`` of 0/1 flags.

        The estimator's parameters are only read here, never modified, so
        repeated calls on the same input give the same result.
        """
        keywords = list(self.kw)
        X = np.zeros((len(examples), len(keywords)))

        for row, abstract in enumerate(examples):
            tokens = word_tokenize(abstract)

            # Collect every n-gram of length 1..max_len once, as a set, so each
            # keyword lookup below is a constant-time membership test.
            phrases = set()
            for n in range(1, self.max_len + 1):
                phrases.update(' '.join(gram) for gram in ngrams(tokens, n))

            for col, keyword in enumerate(keywords):
                if keyword in phrases:
                    X[row, col] = 1

        return X

> Let us load the dataset for which we want to make predictions.

In [3]:
# Read the full frame; 'utf-8-sig' drops a leading BOM if the file has one,
# and low_memory=False makes pandas parse the file in a single pass.
df = pd.read_csv(
    'dataframe_pickles/original_frame_500.csv',
    encoding='utf-8-sig',
    low_memory=False,
)
df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,47,48,49,50,51,52,53,54,55,56
0,0,Title,Abstract,Keywords,JEL Codes,Author1,Author2,Author3,Author4,Author5,...,,,,,,,,,,
1,1,,,"[ , corporate social responsibility, , corpor...",,Дохолян Самвел Бахшиевич,,,,,...,,,,,,,,,,
2,2,,,"[ , , , , financial stability, a balanced m...",,Могзоев Александр Мушкудаевич,Шадрина Ирина Николаевна,,,,...,,,,,,,,,,
3,3,,,"[ , , , small business, state support, support]",,Руденко Людмила Геннадьевна,,,,,...,,,,,,,,,,
4,4,,,"[ , , , , , , , , ]",,Суптело Наталья Петровна,Михайлов Павел Сергеевич,,,,...,,,,,,,,,,
5,5,,,"[ , , , , , problems, statistical methodol...",,Тебекин Алексей Васильевич,Тебекин Павел Алексеевич,Кузнецова Галина Васильевна,,,...,,,,,,,,,,
6,6,,,"[ , , , , , , capital repairs, an apartme...",,Клюев Виктор Дмитриевич,Зайцев Дмитрий Анатольевич,Евсикова Юлия Владимировна,,,...,,,,,,,,,,
7,7,,35,"[ , , , , , communications, intraorganizat...",,Алексеев Александр Николаевич,,,,,...,,,,,,,,,,
8,8,,,"[ , , urban agglomeration, municipal union]",,Богославец Даниил Михайлович,,,,,...,,,,,,,,,,
9,9,,,"[ , , , , city, transport infrastructure, t...",,Зубец Антон Желькович,,,,,...,,,,,,,,,,


In [8]:
# Rows whose column '3' is still empty are the ones we need to predict for.
# Select rows and columns in one .loc call and take an explicit copy, so the
# column assignment below works on an independent frame instead of a slice of
# `df` (which would raise SettingWithCopyWarning).
df_a = df.loc[pd.isnull(df['3']), ['0', '1', '3']].copy()

# Column '1' becomes the combined text of columns '0' and '1'.
df_a['1'] = df_a['0'] + ' ' + df_a['1']

# Two-column working frame: combined text in '1', (still empty) codes in '2'.
new_df = pd.DataFrame(df_a['1'])
new_df['2'] = df_a['3']
new_df

Unnamed: 0,1,2
1,,
2,,
3,,
4,,
5,,
6,,
7,35,
8,,
9,,
10,,


In [10]:
# Combined text of every row without a code, as a plain array for the pipelines.
values_to_predict = new_df['1'].values
values_to_predict

array(['   ', '   ', '   ', ...,
       'explaining price variability in the italian market for high quality wines explaining price variability in the italian market for high quality wines di adele coppola valeria sodano fabio verneau abstract in the last 20 years the italian wine market exhibited a dramatic change while total consumption has been falling there has been a continuos rise in the demand for high quality product the paper analyses the price quality relationships in the market for the italian high quality wines in order to better understand current strategies of product differentiation in the wine market we estimated a regression model with price as dependent variable and a set of quality attributes and reputation indicators as independent variables the empirical findings show that the higher prices are associated both with higher quality and reputation and that consumer perception of quality is affected by subjective rather than objective quality attributes ',
       'il c

In [11]:
pipelines = ['pipeline_d.pkl', 'pipeline_e.pkl', 'pipeline_f.pkl', 'pipeline_g.pkl',
             'pipeline_h.pkl', 'pipeline_i.pkl', 'pipeline_j.pkl', 'pipeline_k.pkl',
             'pipeline_l.pkl', 'pipeline_o.pkl']

codes = ['D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'O']

# One bucket per text; it collects the letter of every pipeline that predicts 1
# for that text, in the order the pipelines are listed above.
hits_per_row = [[] for _ in values_to_predict]

for pipeline_file, char in zip(pipelines, codes):
    # NOTE(review): joblib.load unpickles the file, so only load pipeline files
    # from a trusted source. Presumably the pipelines reference Feature_Kw,
    # which is why that class is defined earlier -- confirm.
    pipe = joblib.load(pipeline_file)
    prediction = pipe.predict(values_to_predict)

    for hits, label in zip(hits_per_row, prediction):
        if label == 1:
            hits.append(char)

# Space-separated letters per text, or NaN when no pipeline fired.
codes_predict = [' '.join(hits) if hits else np.nan for hits in hits_per_row]

In [12]:
# Replace the empty code column with the combined predictions (one entry per row).
new_df['2'] = codes_predict
new_df

Unnamed: 0,1,2
1,,
2,,
3,,
4,,
5,,
6,,
7,35,
8,,
9,,
10,,


In [13]:
# Inspect the predictions: keep only the rows with no missing value in any
# column, i.e. the texts for which at least one code was predicted.
df_b = new_df.dropna(axis=0, how='any')
df_b

Unnamed: 0,1,2
290,export decision support model 1 2 3 17 5 19...,F
352,infrastructure development as a condition for ...,O
499,the analysis of dynamics of modern russia s in...,O
595,ipo,G
749,ipo according to the international practice ...,G
849,ipo ipo,G
1292,this paper considers the tendency of format...,O
1971,40 60 consider the problem of compensation ...,J
1972,the comparative assessment of the main appr...,O
1973,conducted a comprehensive assessment of the...,O


> We want to append our new predictions to the original frame we started with.

In [14]:
# Plain-list views of the prediction frame (text in '1', predicted codes in '2') ...
new_jels = list(new_df.loc[:, '2'].values)
new_abstracts = list(new_df.loc[:, '1'].values)

# ... and of the full frame (combined text of columns '0' and '1', codes in '3').
original_jels = list(df.loc[:, '3'].values)
original_abstracts = list((df.loc[:, '0'] + ' ' + df.loc[:, '1']).values)

In [15]:
# Write each prediction back to the row it came from.
# new_df keeps the row labels of df, so rows are matched by index rather than
# by searching for the abstract text: a text search is quadratic and always
# hits the FIRST occurrence, so duplicated (or blank) abstracts would never
# receive their own prediction.
row_positions = df.index.get_indexer(new_df.index)
if (row_positions < 0).any():
    raise KeyError('new_df contains rows that are not present in df')

for position, predicted in zip(row_positions, new_jels):
    original_jels[position] = predicted

In [16]:
def _extract_jel_letters(raw):
    """Reduce a raw code string to its distinct uppercase letters.

    The letters are returned space-separated and sorted, so the result is
    identical on every run (iterating a bare ``set`` of strings is not).
    Non-string values, such as NaN for a missing code, are returned unchanged.
    """
    if not isinstance(raw, str):
        return raw
    # str.isupper() is already False for digits, '.', tabs and spaces, so no
    # separate filtering of those characters is needed.
    return ' '.join(sorted({ch for ch in raw if ch.isupper()}))


# Position 0 is left as the 0.0 placeholder; only the entries from index 1 on
# are cleaned.
cleaned_jels = list(np.zeros(len(original_jels)))
cleaned_jels[1:] = [_extract_jel_letters(raw) for raw in original_jels[1:]]

In [24]:
# Row 0 of the frame carries the column titles, so put a title back in slot 0
# (the cleaning step skipped that position).
# NOTE(review): the loaded frame shows 'JEL Codes' here; this writes 'Jel Codes'
# -- confirm the change of casing is intended.
cleaned_jels[0] = 'Jel Codes'
# Overwrite column '3' of the full frame with the cleaned codes.
df['3'] = cleaned_jels
df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,47,48,49,50,51,52,53,54,55,56
0,0,Title,Abstract,Keywords,Jel Codes,Author1,Author2,Author3,Author4,Author5,...,,,,,,,,,,
1,1,,,"[ , corporate social responsibility, , corpor...",,Дохолян Самвел Бахшиевич,,,,,...,,,,,,,,,,
2,2,,,"[ , , , , financial stability, a balanced m...",,Могзоев Александр Мушкудаевич,Шадрина Ирина Николаевна,,,,...,,,,,,,,,,
3,3,,,"[ , , , small business, state support, support]",,Руденко Людмила Геннадьевна,,,,,...,,,,,,,,,,
4,4,,,"[ , , , , , , , , ]",,Суптело Наталья Петровна,Михайлов Павел Сергеевич,,,,...,,,,,,,,,,
5,5,,,"[ , , , , , problems, statistical methodol...",,Тебекин Алексей Васильевич,Тебекин Павел Алексеевич,Кузнецова Галина Васильевна,,,...,,,,,,,,,,
6,6,,,"[ , , , , , , capital repairs, an apartme...",,Клюев Виктор Дмитриевич,Зайцев Дмитрий Анатольевич,Евсикова Юлия Владимировна,,,...,,,,,,,,,,
7,7,,35,"[ , , , , , communications, intraorganizat...",,Алексеев Александр Николаевич,,,,,...,,,,,,,,,,
8,8,,,"[ , , urban agglomeration, municipal union]",,Богославец Даниил Михайлович,,,,,...,,,,,,,,,,
9,9,,,"[ , , , , city, transport infrastructure, t...",,Зубец Антон Желькович,,,,,...,,,,,,,,,,


In [25]:
# Persist the completed frame; neither the pandas column labels nor the row
# index are written to the file.
df.to_csv(
    'cleaned_jels.csv',
    header=False,
    index=False,
    encoding='utf-8-sig',
)