<a href="https://colab.research.google.com/github/SeloPeylo/h-da-Mashine-Learning-Praktikum/blob/master/linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing and Processing of Data

In [None]:
import numpy as np
import pandas as pd
import sklearn as skl
from sklearn.preprocessing import Imputer
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model, preprocessing
from sklearn.metrics import mean_squared_error, r2_score

# Import data.
# Both files are read without a header row; the first column of the namelist
# file supplies the column names for the data table.
namelist = pd.read_csv('https://gist.githubusercontent.com/naska29/4533e932eba2020f00b2dc826bfed1d0/raw/f757814b823a68324e7d8a4ff25e3fa2612a8e30/communities.namelist', header=None)
datapd = pd.read_csv('https://gist.githubusercontent.com/naska29/7c031b27068e263ea71b36dbe6ae9e83/raw/693755cf101f6486bc5ed63bf412670892e25f3f/communities.data', header=None)
datapd.columns = namelist.iloc[:, 0]
# Shuffle the rows now because the train/test split further down is purely
# positional (first 60 % / remaining 40 %).
datapd = skl.utils.shuffle(datapd)
datapd.to_csv('lab02rawdata.csv')

# Preprocessing.
# The target is extracted after shuffling so it stays row-aligned with the
# feature table derived from the same frame.
datay = datapd['ViolentCrimesPerPop']
# '?' entries are treated as missing values.
datapd = datapd.replace(['?'], [np.nan])

# Keep a reference to the full table before narrowing it down to the selected
# feature columns; the correlation cell works on `data`.
data = datapd
# Alternative selection by position (disabled):
#datapd = datapd.iloc[:,5:99]

# Hand-picked feature columns (9 of them).
# NOTE(review): presumably chosen for their correlation with
# ViolentCrimesPerPop -- confirm against the correlation cell's output.
datapd = datapd[['PctIlleg',
                 'racepctblack',
                 'pctWPubAsst',
                 'FemalePctDiv',
                 'TotalPctDiv',
                 'MalePctDivorce',
                 'PctPersDenseHous',
                 'NumIlleg',
                 'PctHousLess3BR']]

print(datapd)
# Only float64/int64 columns are kept for the imputer.
is_numeric = (datapd.dtypes == np.float64) | (datapd.dtypes == np.int64)
datapd = datapd.loc[:, is_numeric]

# Replace the remaining NaNs by the column mean.
# NOTE(review): `Imputer` was removed in scikit-learn 0.22 in favour of
# `sklearn.impute.SimpleImputer`; migrating also requires changing the
# `from sklearn.preprocessing import Imputer` line at the top of this cell.
imp = skl.preprocessing.Imputer(missing_values=np.nan, strategy='mean', axis=0)
imp.fit(datapd)
datax = imp.transform(datapd)
# Wrap the *imputed* array (previously the un-imputed `datapd` was wrapped,
# which threw the imputation away). Index and column labels are carried over
# so `datax` stays aligned with `datay`.
# NOTE(review): assumes the imputer keeps every column (i.e. no selected
# column is entirely NaN) -- otherwise the column count would not match.
datax = pd.DataFrame(datax, index=datapd.index, columns=datapd.columns)
  
#print('This is Y:\n', datay)
#print('This is X:\n', datax)


## **Correlations of Features with the Target (ViolentCrimesPerPop)**

In [None]:
#print("Correlation:", datapd.corr(datapd['ViolentCrimesPerPop']))
# Correlation of every numeric column (the first column is skipped) with the
# target. Non-numeric columns are filtered out explicitly: older pandas
# dropped them silently inside corr(), newer pandas raises on them instead.
correlations = (
    data[data.columns[1:]]
    .select_dtypes(include=[np.number])
    .corr()['ViolentCrimesPerPop']
    # Drop the last entry -- presumably the target's correlation with itself,
    # which holds as long as the target is the last column (confirm).
    .iloc[:-1]
)
correlations = correlations.sort_values(ascending=True)
print(correlations)

## Splitting the Original Sample into a Training Set and a Test Set

In [None]:
# Positional split: the first 60 % of the rows form the training set and the
# remainder the test set.
trainrows = int(len(datax) * 0.6)
print('Data Splits at row #', trainrows)

xtrain, xtest = datax.iloc[:trainrows, :], datax.iloc[trainrows:, :]
ytrain, ytest = datay.iloc[:trainrows], datay.iloc[trainrows:]

# Persist all four partitions as CSV files.
for _part, _csv_name in ((xtrain, 'lab02xtrain.csv'),
                         (xtest, 'lab02xtest.csv'),
                         (ytrain, 'lab02ytrain.csv'),
                         (ytest, 'lab02ytest.csv')):
    _part.to_csv(_csv_name)

## Training the Linear Regression Model

In [None]:
# Fit a plain linear regression on the training partition
# (fit() returns the estimator itself, so construction and fitting are chained).
regr = linear_model.LinearRegression().fit(xtrain, ytrain)

# Predict the target for the held-out test partition.
crimes_y_pred = regr.predict(xtest)

# Report the learned coefficients and the two test-set metrics.
test_mse = mean_squared_error(ytest, crimes_y_pred)
test_r2 = r2_score(ytest, crimes_y_pred)  # R^2: 1 is a perfect prediction
print('Coefficients: \n', regr.coef_)
print("Mean squared error: {:.2f}".format(test_mse))
print('Variance score: {:.2f}'.format(test_r2), ' #Accuracy')

# Plot outputs (disabled)
#plt.scatter(xtest['population'], ytest,  color='black')
#plt.plot(xtest['population'], crimes_y_pred, color='blue', linewidth=1)

## Building a Model Using Polynomial Features

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Polynomial degrees to compare; one pipeline is fitted per degree.
degrees = [1, 2, 4, 6, 8]
print(degrees)
mse_list = []  # test-set mean squared error, one entry per degree
r2_list = []   # test-set R^2 score, one entry per degree

for degree in degrees:
    # Expand the features polynomially (without a bias column), then fit a
    # linear regression on the expanded features.
    pipeline = Pipeline([
        ("polynomial_features",
         PolynomialFeatures(degree=degree, include_bias=False)),
        ("linear_regression", LinearRegression()),
    ])
    pipeline.fit(xtrain, ytrain)
    y_pred = pipeline.predict(xtest)

    # Evaluate the models using crossvalidation (disabled)
    #scores = cross_val_score(pipeline, xtrain[:, np.newaxis], ytrain,
    #                         scoring="neg_mean_squared_error", cv=10)

    mse_list.append(mean_squared_error(ytest, y_pred))
    r2_list.append(r2_score(ytest, y_pred))
    print('Degree of Polynomial Function', degree)
    print('Mean squared Error', mse_list[-1])
    print('Variance Score', r2_list[-1])

print(mse_list)
print(r2_list)

# Two side-by-side panels: MSE on the left, R^2 on the right, both plotted
# against the polynomial degree.
panels = ((mse_list, 'Mean Square Error'), (r2_list, 'Variance Score'))
for panel_no, (metric_values, metric_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.plot(degrees, metric_values)
    plt.xlabel('Degree of Polynomial Function')
    plt.ylabel(metric_label)

plt.show()

## Comparing different Algorithms

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error

# Not used in this cell; kept in case other cells rely on it.
num_instances = len(xtrain)

# Candidate regressors, each constructed with its default arguments,
# paired with a short display name.
models = [
    ('LiR', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('Bag_Re', BaggingRegressor()),
    ('RandomForest', RandomForestRegressor()),
    ('ExtraTreesRegressor', ExtraTreesRegressor()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('SVM', SVR()),
]

# Evaluations
results = []  # test-set mean squared error per model, same order as `names`
names = []
scoring = []  # not filled in this cell; kept in case other cells rely on it

for name, model in models:
    # Fit on the training partition and predict the held-out test partition.
    model.fit(xtrain, ytrain)
    predictions = model.predict(xtest)

    # Evaluate the model. The second metric is the mean *squared* error; it
    # used to be stored under the misleading name `mae`.
    score = r2_score(ytest, predictions)
    mse = mean_squared_error(ytest, predictions)
    results.append(mse)
    names.append(name)

    # Output format: "<name>: <R^2> (<MSE>)"
    msg = "%s: %f (%f)" % (name, score, mse)
    print(msg)

## Regularization

In [None]:
# Lasso regularization strengths to compare.
alphas = [0.0001, 0.001, 0.02, 0.04, 0.08, 0.16]
varScr = []  # test-set R^2 score, one entry per alpha
print(alphas)

for alpha in alphas:
    # Fit a Lasso model with this regularization strength and score it on
    # the held-out test partition.
    regr = linear_model.Lasso(alpha=alpha)
    regr.fit(xtrain, ytrain)
    y_pred = regr.predict(xtest)

    r2 = r2_score(ytest, y_pred)
    mse = mean_squared_error(ytest, y_pred)
    varScr.append(r2)

    print('Alphanr: ', alpha, ' Mean Squared Error: ', mse, 'Variance Score: ', r2)

# Create the figure once, after the sweep. It used to be created inside the
# loop, which opened one extra empty figure per alpha.
fig = plt.figure()
plt.plot(alphas, varScr)
plt.xlabel('Regularization Coefficient (lambda)')
plt.ylabel('Variance score (varScr)')
plt.show()





