In [1]:
import sklearn
import numpy as np
import matplotlib
import scipy
import pandas as pd
import scipy.stats
import seaborn as sns
from sklearn.datasets import make_regression
from matplotlib import pyplot as plt
from scipy.stats import shapiro
from scipy.stats import normaltest
%matplotlib notebook


# Visualitzarem només 3 decimals per mostra
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Funcio per a llegir dades en format csv
def load_dataset(path):
    dataset = pd.read_csv(path, header=0, delimiter=',')
    return dataset
# Carreguem dataset d'exemple
dataset = load_dataset('COMBO17.csv')
data = dataset.values

x = data[:, :2]
y = data[:, 2]

# Per veure les dimensionalitats
print("Dimensionalitat de la BBDD:", dataset.shape)
print("Dimensionalitat de les entrades X:", x.shape)
print("Dimensionalitat de l'atribut Y:", y.shape)


# Eliminem valors inexistents
dataset = dataset.dropna()
dataset = dataset.rename(columns={"e.Mcz": "eMcz"})

#Analisis de chi2red y e.Mcz: la info del dataset indica que los valores demasiado altos corresponden
#a galaxias poco interesantes

#puede ser de interés para representar "chi2red" https://stackoverflow.com/questions/21033720/python-pandas-histogram-log-scale

hist = dataset[["chi2red", "eMcz", "ApDRmag"]].hist()
dataset[["chi2red", "eMcz", "ApDRmag"]].describe()


dataset = dataset.drop(dataset[dataset.chi2red > 5].index)
dataset = dataset.drop(dataset[dataset.eMcz > 0.2].index)
dataset = dataset.drop(dataset[dataset.ApDRmag < 0].index)

hist = dataset[["chi2red", "eMcz", "ApDRmag"]].hist()
dataset[["chi2red", "eMcz", "ApDRmag"]].describe()

lst = []

# Drop de la columna 1 que contiene el indice 
dataset = dataset.drop(columns='Nr')

# Quitamos las columnas entre la 56 y la 65 al ser valores redundantes de las 13 anteriores
lst = lst[-10:]


# Creamos una diccionario para seleccionar los nombres de columnas que queremos dropear
dataTypeDict = dict(dataset.dtypes)


for x in dataTypeDict:
    if "e" in x:
        lst.append(x)
# Drop de las columnas que den info acerca de los errores        
dataset = dataset.drop(columns = lst)
print("Total de valors no existents:", dataset.isnull().sum().sum())


Dimensionalitat de la BBDD: (3462, 65)
Dimensionalitat de les entrades X: (3462, 2)
Dimensionalitat de l'atribut Y: (3462,)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Total de valors no existents: 0


In [2]:
# Mirem la correlació entre els atributs d'entrada per entendre millor les dades
correlacio = dataset.corr()


plt.figure(figsize=(16, 5))
ax = sns.heatmap(correlacio, annot=True, linewidths=.5)

<IPython.core.display.Javascript object>

In [3]:
#en el dataset se habla de la correlación entre Rmag y mumax para sacar los tamaños de las galaxias
#si bien es cierto que nos encontramos que cierto grupo de atributos son muc correlativos 
#(el cuadrado ese que se ve, decir las columnas), hay que tener en cuenta que son datos muy parecidos/relacionados (bandas 
#cercanas en la medición), por lo que no se saca mucho de ahí

#comprobamos la correlación entre estos dos datos mediante gráficos

#relacio = sns.pairplot(dataset[["Rmag", "ApDRmag", "Mcz", "mumax"]])

rel_Rmag_mumax = dataset[["Rmag", "ApDRmag", "Mcz", "mumax"]]
print(rel_Rmag_mumax.size)
relacio = sns.pairplot(rel_Rmag_mumax)

rel_Rmag_mumax = dataset[["Rmag", "ApDRmag", "mumax"]]
relacio = sns.pairplot(rel_Rmag_mumax)


4336


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
import numpy
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn import linear_model


In [5]:
from sklearn.pipeline import make_pipeline




In [6]:
poly_model = make_pipeline(PolynomialFeatures(degree=3), linear_model.LinearRegression())
poly_model.fit(x_train.reshape(-1, 1), y_train.reshape(-1, 1))

linear_model_1 = linear_model.LinearRegression()
linear_model_1.fit(x_train, y_train)

fig = plt.figure()
ax = plt.axes()

x_test.sort(axis=0)

ax.set(xlabel='X', ylabel='Y', title='X vs Y')
ax.scatter(x,y, alpha=0.5, cmap='viridis')
#ax.plot(x_test, linear_model_1.predict(x_test), color='green', label='linear')
ax.plot(x_test, poly_model.predict(x_test), color='red', label='poly')


NameError: name 'x_train' is not defined

In [None]:

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

x = ds[["ApDRmag"]]
y = ds[["mumax"]]

test_size = 0.33

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size)

x_train = x_train.values
y_train = y_train.values

x_test = x_test.values
y_test = y_test.values

x = ds[["ApDRmag"]].values
y = ds[["mumax"]].values


poly_model = make_pipeline(PolynomialFeatures(degree=4), linear_model.LinearRegression())
poly_model.fit(x_train.reshape(-1, 1), y_train.reshape(-1, 1))

linear_model_1 = linear_model.LinearRegression()
linear_model_1.fit(x_train, y_train)

fig = plt.figure()
ax = plt.axes()

x_test.sort(axis=0)

ax.set(xlabel='X', ylabel='Y', title='X vs Y')
ax.scatter(x,y, alpha=0.5, cmap='viridis')
#ax.plot(x_test, linear_model_1.predict(x_test), color='green', label='linear')
ax.plot(x_test, poly_model.predict(x_test), color='red', label='poly')

print(poly_model.predict(x_test).reshape(-1, 1).shape)
print(y_test.reshape(-1, 1).shape)

print(mean_squared_error(poly_model.predict(x_test),y_test))
print(r2_score(poly_model.predict(x_test), y_test))


In [12]:


def mean_squared_error(y1, y2):
    # comprovem que y1 i y2 tenen la mateixa mida
    assert(len(y1) == len(y2))
    mse = np.sum((y1-y2)**2)
    return mse / len(y1)


x = dataset[["ApDRmag"]]
y = dataset[["mumax"]]


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

x_train = x_train.values
y_train = y_train.values

x_test = x_test.values
y_test = y_test.values

x = x.values
y = y.values


poly_model = make_pipeline(PolynomialFeatures(degree=5
                                             ), linear_model.LinearRegression())
poly_model.fit(x_train.reshape(-1, 1), y_train.reshape(-1, 1))

linear_model_1 = linear_model.LinearRegression()
linear_model_1.fit(x_train, y_train)

fig = plt.figure()
ax = plt.axes()

x_test.sort(axis=0)

ax.set(xlabel='X', ylabel='Y', title='X vs Y')
ax.scatter(x,y, alpha=0.5, cmap='viridis')
ax.plot(x_test, linear_model_1.predict(x_test), color='green', label='linear')
ax.plot(x_test, poly_model.predict(x_test), color='red', label='poly')


print("MSE:", mean_squared_error(poly_model.predict(x_test),y_test))
print("R^2:", r2_score(poly_model.predict(x_test), y_test))


    

<IPython.core.display.Javascript object>

MSE: 0.4556020709957011
R^2: -4.542019578265055
