In [70]:
def read_feature_descriptions(filename):
    """Parse a ``.names``-style description file into feature names and types.

    Lines that start with ``|`` or that contain no ``:`` are ignored.  For
    every remaining line the text before the first ``:`` is taken as the
    feature name.  The feature is typed ``float`` when the text right after
    that colon begins with ``' continuous.'`` and ``str`` otherwise.

    Returns:
        A ``(names, types)`` pair of parallel lists.
    """
    names, types = [], []
    with open(filename) as handle:
        for line in handle:
            # Comment lines start with '|'; lines without ':' describe no feature.
            is_comment = line.startswith('|')
            if is_comment or ':' not in line:
                continue
            fields = line.split(':')
            names.append(fields[0])
            types.append(float if fields[1].startswith(' continuous.') else str)
    return names, types

# Parse the feature names and their Python types from the dataset description file.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
feat_names, feat_types = read_feature_descriptions('C:/Users/Nizar/Documents/Studing/Deep Learning HT19 Umeå University/Applied Machine Learning, 2019/Dataset/adult.names')

In [71]:
# Display the parsed feature names.
feat_names


['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country']

In [72]:
# Display the converter (float or str) chosen for each feature, in the same order as feat_names.
feat_types


[float,
 str,
 float,
 str,
 float,
 str,
 str,
 str,
 str,
 str,
 float,
 float,
 float,
 str]

In [73]:
def read_data(filename, feat_names, feat_types):
    """Read a ``', '``-separated data file into feature dicts and labels.

    Every usable line must hold one value per feature followed by a label.
    Trailing newline and period characters are removed before splitting, so
    a label written as ``>50K.`` is returned as ``>50K``.

    Args:
        filename: Path of the data file.
        feat_names: Feature names, in column order.
        feat_types: One callable per feature (e.g. ``float`` or ``str``)
            used to convert the raw column text.

    Returns:
        ``(X, Y)`` where ``X`` is a list of ``{feature name: value}`` dicts
        and ``Y`` is the list of label strings taken from the last column.

    Raises:
        ValueError: If a converter such as ``float`` rejects a column value.
    """
    # A complete row has every feature column plus the trailing label column.
    n_required = len(feat_names) + 1
    X = []
    Y = []
    with open(filename) as f:
        for l in f:
            # Only strip on the right: a leading '.' (as in '.5') belongs to the
            # first value and must not be removed.
            cols = l.rstrip('\n.').split(', ')
            # Skip empty lines, comments and rows without a label column;
            # otherwise the last feature would be mistaken for the label.
            if len(cols) < n_required:
                continue
            X.append({n: t(c) for n, t, c in zip(feat_names, feat_types, cols)})
            Y.append(cols[-1])
    return X, Y

# Load the training and test splits with the hand-written parser above.
# NOTE(review): both paths are absolute and machine-specific; adjust them
# before running the notebook on another machine.
Xtrain, Ytrain = read_data('C:/Users/Nizar/Documents/Studing/Deep Learning HT19 Umeå University/Applied Machine Learning, 2019/Dataset/adult.data', feat_names, feat_types)
Xtest, Ytest = read_data('C:/Users/Nizar/Documents/Studing/Deep Learning HT19 Umeå University/Applied Machine Learning, 2019/Dataset/adult.test', feat_names, feat_types)

In [74]:
# Alternatively, use the pandas library to read the data.
# This cell reassigns Xtrain, Ytrain, Xtest and Ytest.
import pandas as pd

def df_to_dicts(df):
    """Turn each row of ``df`` into a ``{column name: value}`` dict.

    Returns a list with one dict per row, in row order.
    """
    column_names = df.columns
    records = []
    for row in df.values:
        records.append(dict(zip(column_names, row)))
    return records

# Load both splits with pandas. header=None means no line is treated as a
# header row, so the column names are supplied explicitly: the feature names
# plus a final 'target' label column.
# NOTE(review): both paths are absolute and machine-specific.
traindata = pd.read_csv('C:/Users/Nizar/Documents/Studing/Deep Learning HT19 Umeå University/Applied Machine Learning, 2019/Dataset/adult.data', header=None, sep=', ', engine='python', names=feat_names+['target'])
# skiprows=1 drops the first line of adult.test (presumably a non-data line; verify against the file).
testdata = pd.read_csv('C:/Users/Nizar/Documents/Studing/Deep Learning HT19 Umeå University/Applied Machine Learning, 2019/Dataset/adult.test', header=None, sep=', ', engine='python', skiprows=1, names=feat_names+['target'])

Xtrain = df_to_dicts(traindata.drop('target',axis=1))
Ytrain = traindata['target']
Xtest = df_to_dicts(testdata.drop('target', axis=1))
# Remove the period at the end of the test labels. rstrip only removes a
# period that is actually present, so a label without one is left intact
# (slicing off the last character would corrupt it), and missing values
# stay missing instead of raising.
Ytest = testdata['target'].str.rstrip('.')



In [96]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score




### Tree-based classifiers:
from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.ensemble import RandomForestClassifier

#from sklearn.tree import DecisionTreeClassifier
## or 
# import sklearn.tree




### Neural network classifier

#from sklearn.neural_network import MLPClassifier
#from  sklearn.preprocessing import StandardScaler



### Linear classifiers:
#from sklearn.svm import LinearSVC
#from sklearn.linear_model import Perceptron
#from sklearn.linear_model import LogisticRegression

# Classification pipeline: DictVectorizer converts the list of feature dicts
# into a numeric matrix, then the single uncommented classifier is trained on it.
# To try another classifier, uncomment it here together with its import above
# and comment out the currently active one.
pipeline = make_pipeline(
    DictVectorizer(),
    
    
    ### Tree-based classifiers
    #RandomForestClassifier()
    #DecisionTreeClassifier()
    ## or 
    #sklearn.tree.DecisionTreeClassifier()
    GradientBoostingClassifier()

    
    ### Linear classifiers:
    #LinearSVC()
    #Perceptron()
    #LogisticRegression()
    
    
    
    ### Neural network classifier (takes longer to train):
    
    ### The linear and neural network classifiers will perform better
    ### if you add a 
    ### sklearn.preprocessing.StandardScaler(with_mean=False)
    ### to the pipeline, after the vectorizer but before the classifier. 
    
    #StandardScaler(with_mean=False),
    #MLPClassifier()
)

In [95]:
from sklearn.model_selection import cross_validate

# Cross-validate the pipeline on the training data and print the resulting
# dict of fit times, score times and per-fold test scores.
print(cross_validate(pipeline, Xtrain, Ytrain))



{'fit_time': array([3.32940936, 3.22801757, 3.15949607]), 'score_time': array([0.15763688, 0.15757871, 0.15658593]), 'test_score': array([0.8620785 , 0.86493459, 0.86998986])}


In [97]:
# Train on the full training set, predict the test set, and print the
# fraction of test labels that were predicted correctly.
pipeline.fit(Xtrain, Ytrain)
Yguess = pipeline.predict(Xtest)
print(accuracy_score(Ytest, Yguess))

0.8712609790553406


In [100]:
import numpy as np
from sklearn.model_selection import train_test_split

# Load the CASP table as a float array; skiprows=1 skips the first line of the file.
# NOTE(review): absolute, machine-specific path.
alldata = np.loadtxt('C:/Users/Nizar/Documents/Studing/Deep Learning HT19 Umeå University/Applied Machine Learning, 2019/Dataset/CASP.csv', skiprows=1, delimiter=',')

# Column 0 is used as the regression target; the remaining columns are the features.
Yall = alldata[:,0]
Xall = alldata[:,1:]
### Unlike the Adult dataset, this dataset does not have a
### standard train/test split. We use the scikit-learn function
### train_test_split to split the data into 80% for training 
### and 20% for testing. 

# A fixed random_state makes the shuffle, and therefore every score computed
# from this split, reproducible across kernel restarts.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xall, Yall, train_size=0.8, random_state=0)

#print(Xtrain[])

[[1.47044000e+04 4.83268000e+03 3.28650000e-01 2.22950000e+02
  2.02666469e+06 2.57585000e+02 5.34750000e+03 2.34000000e+02
  2.43080000e+01]
 [9.38740000e+03 2.15629000e+03 2.29700000e-01 1.10018000e+02
  1.30353141e+06 1.35064000e+02 4.19480000e+03 5.90000000e+01
  3.38637000e+01]
 [7.55055000e+03 1.93553000e+03 2.56340000e-01 8.31325000e+01
  1.05843401e+06 1.23905000e+02 3.09674000e+03 5.00000000e+00
  3.76889000e+01]
 [8.25861000e+03 2.35777000e+03 2.85490000e-01 9.05637000e+01
  1.16747470e+06 1.26941000e+02 3.96787000e+03 1.65000000e+02
  3.40915000e+01]]


In [110]:
###  alternative solution that uses Pandas instead of NumPy
import pandas as pd
# Load the CASP table with pandas and use the 'RMSD' column as the target.
# NOTE(review): absolute, machine-specific path.
alldata = pd.read_csv('C:/Users/Nizar/Documents/Studing/Deep Learning HT19 Umeå University/Applied Machine Learning, 2019/Dataset/CASP.csv')
Yall = alldata['RMSD']
Xall = alldata.drop('RMSD', axis=1)
# 80% train / 20% test. A fixed random_state makes the shuffle, and therefore
# every score computed from this split, reproducible across kernel restarts.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xall, Yall, train_size=0.8, random_state=0)
#print(Xtrain)

In [132]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score

from  sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from  sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor



# Regression pipeline for the CASP data: scale the features, then fit the one
# regressor left uncommented below. To try another model, uncomment it and
# comment out the currently active one.
pipeline = make_pipeline(
    StandardScaler(with_mean=False),
    
    #LinearRegression()
    #Ridge()
    #Lasso()
    #DecisionTreeRegressor()
    #RandomForestRegressor()
    #GradientBoostingRegressor()
    # Fixed seed so the network's random initialisation, and therefore the
    # reported scores, are the same on every run.
    MLPRegressor(random_state=0)
)
# 'neg_mean_squared_error' reports the negated MSE, so values closer to zero are better.
cross_validate(pipeline, Xtrain, Ytrain, scoring='neg_mean_squared_error')



{'fit_time': array([14.92210746, 16.07499266, 16.39915943]),
 'score_time': array([0.0229373 , 0.02991819, 0.02692723]),
 'test_score': array([-21.95707188, -21.60302749, -21.51530901])}

In [133]:
from sklearn.metrics import mean_squared_error
  
# Refit on the full training split and report the mean squared error on the
# held-out test split.
pipeline.fit(Xtrain, Ytrain)
mean_squared_error(Ytest, pipeline.predict(Xtest)) 



20.67512344670125

In [134]:
# Show the kernel's current working directory (leftover debugging aid).
pwd


'C:\\Users\\Nizar'