# Task 1: Predictions
***

Now that we have a model that performs fairly well on all codes, we start filling in our dataset.

In [1]:
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.util import ngrams

from sklearn.externals import joblib

from sklearn.linear_model import SGDClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

>We need our created features class first.

In [2]:
class Feature_Kw(BaseEstimator, TransformerMixin):
    def __init__(self, kw, max_len):
        self.kw = kw
        self.max_len = max_len

    def fit(self, examples, y=None):
        return self

    def transform(self, examples, y=None):
        
        X = np.zeros((len(examples), len(self.kw)))
        
        tmp = []
        first_pass = True
        while self.max_len > 0:
            for i, abstract in enumerate(examples):
                token = word_tokenize(abstract)
                tmp_grams = list(ngrams(token, self.max_len))
                grams = [' '.join(i) for i in tmp_grams]
                for k in self.kw:
                    if first_pass == True:
                        if k in grams:
                            tmp.append(1)
                        else:
                            tmp.append(0)
                            
                        self.max_len -= 1
                    else:
                        if k in grams:
                            ind = self.kw.index(k)
                            tmp[ind] = 1
                            
                        self.max_len -= 1
                            
                first_pass = False
            
            X[i, :] = np.array(tmp)
        
        return X

>Let us load our dataset for which we want to make predictions for.

In [3]:
df = pd.read_csv('dataframe_pickles/original_frame_1000.csv', encoding='utf-8-sig', low_memory=False)
df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,70,71,72,73,74,75,76,77,78,79
0,0,Title,Publisher,Abstract,Keywords,JEL Codes,Author1,Author2,Author3,Author4,...,,,,,,,,,,
1,1,technical efficiency of hungarian farms before...,european association of agricultural economists,hungary is one of the ten countries that have ...,"[agribusiness, production economics]",,"zoltán bakucs, lajos","fertő, imre","fogarasi, józsef",,...,,,,,,,,,,
2,2,exploring the flexibility of polish family far...,european association of agricultural economists,no abstract is available for this item,"[farm management, productivity analysis]",,"pieniadz, agata","renner, swetlana","petrick, martin",,...,,,,,,,,,,
3,3,regional clusters in a function of rural devel...,european association of agricultural economists,as a theoretical concept rural development bas...,[community rural urban development],,"sudarić, tihana","zmaić, krunoslav","petrač, božidar",,...,,,,,,,,,,
4,4,the extended metropolitan area in a new member...,european association of agricultural economists,no abstract is available for this item,"[agricultural and food policy, community rural...",,"zolin, m. bruna",,,,...,,,,,,,,,,
5,5,infrastructural capacity of family owned holdi...,european association of agricultural economists,no abstract is available for this item,"[community rural urban development, land econo...",,"Živković, dragić","dimitrijević, bojan","jelić, sreten","rajić, zoran",...,,,,,,,,,,
6,6,comparative research of food consumption in se...,european association of agricultural economists,no abstract is available for this item,[food consumption nutrition food safety],,"Мilanović, milan r.","Đorović, milutin","stevanović, simo",,...,,,,,,,,,,
7,7,targeting agricultural and rural development m...,european association of agricultural economists,no abstract is available for this item,"[agricultural and food policy, community rural...",,"segré, andrea","rakić, renata","rokvić, gordana","vittuari, matteo",...,,,,,,,,,,
8,8,needs assessment analysis of small rural house...,european association of agricultural economists,no abstract is available for this item,[consumer household economics],,"bogdanov, natalija","moslavac, nenad",,,...,,,,,,,,,,
9,9,designing a rural development strategy for ser...,european association of agricultural economists,no abstract is available for this item,[community rural urban development],,"cochrane, nancy","reed, michael","jovanović, zlatko",,...,,,,,,,,,,


In [15]:
df_a = df[pd.isnull(df['4'])]
df_a = df_a[['0', '2', '4']]
df_a['3'] = df_a['0'] + ' ' + df_a['2']

new_df = pd.DataFrame(df_a['3'])
new_df['4'] = df_a['4']
new_df

Unnamed: 0,3,4
1,technical efficiency of hungarian farms before...,
2,exploring the flexibility of polish family far...,
3,regional clusters in a function of rural devel...,
4,the extended metropolitan area in a new member...,
5,infrastructural capacity of family owned holdi...,
6,comparative research of food consumption in se...,
7,targeting agricultural and rural development m...,
8,needs assessment analysis of small rural house...,
9,designing a rural development strategy for ser...,
10,rural development and the heritage of chayanov...,


In [16]:
values_to_predict = new_df['3'].values
values_to_predict

array([ 'technical efficiency of hungarian farms before and after accession hungary is one of the ten countries that have joined the european union eu in may 2004 hungarian farmers are now entitled to receive direct payments per ha in the frame of the single area payment scheme saps while these payments are still lower than the ones received by farmers in the eu 15 they are higher than what hungarian farmers used to receive from national pre accession budget this raises the question of whether accession to the eu has had a positive impact on farmers performance in order to contribute to this issue the paper will investigate technical efficiency of hungarian farmers between 2001 and 2005 using a panel dataset of farms while some studies have investigated other aspects of farm performance in hungary total factor productivity in 1997 by hughes 2000 profitability and total factor productivity in 2000 by davidova et al 2002 there is a clear gap regarding technical efficiency of hungary s fa

In [23]:
pipelines = ['pipeline_d.pkl', 'pipeline_e.pkl', 'pipeline_f.pkl', 'pipeline_g.pkl',
             'pipeline_h.pkl', 'pipeline_i.pkl', 'pipeline_j.pkl', 'pipeline_k.pkl',
             'pipeline_l.pkl', 'pipeline_o.pkl']

codes = ['D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'O']

codes_predict = []
first_predict = True

for p in range(len(pipelines)):
    pipe = joblib.load(pipelines[p])
    char = codes[p]
    
    prediction = pipe.predict(values_to_predict)
    
    tmp_codes_predict = []
    for i in prediction:
        if i == 1:
            tmp_codes_predict.append(char)
        else:
            tmp_codes_predict.append(np.nan)
            
    if first_predict == True:
        codes_predict = tmp_codes_predict
        first_predict = False
    else:
        for i in range(len(tmp_codes_predict)):
            if type(codes_predict[i]) == float:
                codes_predict[i] = tmp_codes_predict[i]
            else:
                if type(tmp_codes_predict[i]) != float:
                    codes_predict[i] = codes_predict[i] + ' ' + tmp_codes_predict[i]

In [24]:
new_df['4'] = codes_predict
new_df

Unnamed: 0,3,4
1,technical efficiency of hungarian farms before...,
2,exploring the flexibility of polish family far...,
3,regional clusters in a function of rural devel...,
4,the extended metropolitan area in a new member...,
5,infrastructural capacity of family owned holdi...,
6,comparative research of food consumption in se...,
7,targeting agricultural and rural development m...,
8,needs assessment analysis of small rural house...,
9,designing a rural development strategy for ser...,
10,rural development and the heritage of chayanov...,


In [25]:
#To look at predictions
df_b = new_df.dropna()
df_b

Unnamed: 0,3,4
21,development performance of agricultural sector...,O
23,public goods provision in rural area case from...,H
37,export growth of agricultural products as fact...,O
40,croatian agriculture towards world market libe...,F
57,comparative advantages in agro food trade of c...,F
76,basis risk and weather hedging effectiveness b...,G
84,estimating a production function under product...,D
88,index based compensation for weather risk in t...,G
89,willingness to pay for weather derivatives by ...,G
95,using participating and financial contracts to...,G


> We want to append our new predictions to the original frame we started with.

In [28]:
original_abstracts = list((df['0'] + ' ' + df['2']).values)
original_jels = list(df['4'].values)
new_abstracts = list(new_df['3'].values)
new_jels = list(new_df['4'].values)

In [29]:
for i in range(len(new_abstracts)):
    index = original_abstracts.index(new_abstracts[i])
    if new_jels[i] != original_jels[index]:
            original_jels[index] = new_jels[i]

In [31]:
cleaned_jels = list(np.zeros(len(original_jels)))
numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
for i in range(1, len(original_jels)):
    tmp = []
    if type(original_jels[i]) != float:
        l = list(original_jels[i])
        for j in l:
            if j not in numbers and j != '.' and j != '\t' and j.isupper() == True:
                tmp.append(j.replace(' ', ''))
                
        cleaned_jels[i] = ' '.join(list(set(tmp)))
    else:
        cleaned_jels[i] = original_jels[i]

for i in range(1, len(cleaned_jels)):
    if type(cleaned_jels[i]) != float:
        cleaned_jels[i] = cleaned_jels[i].strip(' ')

In [32]:
cleaned_jels[0] = 'Jel Codes'
df['4'] = cleaned_jels
df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,70,71,72,73,74,75,76,77,78,79
0,0,Title,Publisher,Abstract,Keywords,Jel Codes,Author1,Author2,Author3,Author4,...,,,,,,,,,,
1,1,technical efficiency of hungarian farms before...,european association of agricultural economists,hungary is one of the ten countries that have ...,"[agribusiness, production economics]",,"zoltán bakucs, lajos","fertő, imre","fogarasi, józsef",,...,,,,,,,,,,
2,2,exploring the flexibility of polish family far...,european association of agricultural economists,no abstract is available for this item,"[farm management, productivity analysis]",,"pieniadz, agata","renner, swetlana","petrick, martin",,...,,,,,,,,,,
3,3,regional clusters in a function of rural devel...,european association of agricultural economists,as a theoretical concept rural development bas...,[community rural urban development],,"sudarić, tihana","zmaić, krunoslav","petrač, božidar",,...,,,,,,,,,,
4,4,the extended metropolitan area in a new member...,european association of agricultural economists,no abstract is available for this item,"[agricultural and food policy, community rural...",,"zolin, m. bruna",,,,...,,,,,,,,,,
5,5,infrastructural capacity of family owned holdi...,european association of agricultural economists,no abstract is available for this item,"[community rural urban development, land econo...",,"Živković, dragić","dimitrijević, bojan","jelić, sreten","rajić, zoran",...,,,,,,,,,,
6,6,comparative research of food consumption in se...,european association of agricultural economists,no abstract is available for this item,[food consumption nutrition food safety],,"Мilanović, milan r.","Đorović, milutin","stevanović, simo",,...,,,,,,,,,,
7,7,targeting agricultural and rural development m...,european association of agricultural economists,no abstract is available for this item,"[agricultural and food policy, community rural...",,"segré, andrea","rakić, renata","rokvić, gordana","vittuari, matteo",...,,,,,,,,,,
8,8,needs assessment analysis of small rural house...,european association of agricultural economists,no abstract is available for this item,[consumer household economics],,"bogdanov, natalija","moslavac, nenad",,,...,,,,,,,,,,
9,9,designing a rural development strategy for ser...,european association of agricultural economists,no abstract is available for this item,[community rural urban development],,"cochrane, nancy","reed, michael","jovanović, zlatko",,...,,,,,,,,,,


In [33]:
df.to_csv('cleaned_jels.csv', header=False, index=False, encoding='utf-8-sig')