## Bibliotecas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Visualizar dataset

In [2]:
resultado = pd.read_csv('50_Startups.csv')
resultado.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


## Separar dataset en rasgos y clases

In [3]:
x = resultado.iloc[:, :-1].values
y = resultado.iloc[:, 4].values


## Convertir datos categóricos a numéricos

In [4]:
# ----------------| Para varias columnas |----------------
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# --> Transformar varias columnas
ct = ColumnTransformer(
    # Columnas para transformar
    [('one_hot_encoder', OneHotEncoder(), [3])],
    # Lo que pasara con las demas columnas
    remainder="passthrough"
)

x = ct.fit_transform(x)



In [5]:
# --> Evitar la trampa de las variables ficticias
x = x[:, 1:]
print(x)


[[0.0 1.0 165349.2 136897.8 471784.1]
 [0.0 0.0 162597.7 151377.59 443898.53]
 [1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 1.0 144372.41 118671.85 383199.62]
 [1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 1.0 131876.9 99814.71 362861.36]
 [0.0 0.0 134615.46 147198.87 127716.82]
 [1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 1.0 120542.52 148718.95 311613.29]
 [0.0 0.0 123334.88 108679.17 304981.62]
 [1.0 0.0 101913.08 110594.11 229160.95]
 [0.0 0.0 100671.96 91790.61 249744.55]
 [1.0 0.0 93863.75 127320.38 249839.44]
 [0.0 0.0 91992.39 135495.07 252664.93]
 [1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 1.0 114523.61 122616.84 261776.23]
 [0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 1.0 94657.16 145077.58 282574.31]
 [1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 1.0 86419.7 153514.11 0.0]
 [0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 1.0 78389.47 153773.43 299737.29]
 [1.0 0.0 73994.56 122782.75 303319.26]
 [1.0 0.0 67532.53 105751.03 304768.73]
 [0.0 1.0 77044.01 99281.34 140574.81]
 [0

## Conjunto de entrenamiento y prueba

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2,random_state=0)


## Modelo de regresión multiple

#### Crear y entrenar modelo

In [7]:
from sklearn.linear_model import LinearRegression

regresion = LinearRegression()
regresion.fit(x_train, y_train)

#### Algoritmo predictivo

In [8]:
y_predict = regresion.predict(x_test)

## Evaluar modelo

#### Crear modelo

In [9]:
import statsmodels.api as sm

# --> Agregar una columna de 1's
x = np.append(arr=np.ones((50,1)).astype(int), values=x, axis=1)

# --> Asignar el nivel de significancia
# Si el p-valor es mayor a este nivel, se elimina la columna
SL = 0.05



#### Función para eliminación hacia atras

In [10]:
def eliminacionAtras(x, sl):
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(y, x.tolist()).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
    regressor_OLS.summary()
    return x


x_opt = x[:, [0, 1, 2, 3, 4, 5]]
eliminacionAtras(x_opt,SL)



array([[1, 165349.2],
       [1, 162597.7],
       [1, 153441.51],
       [1, 144372.41],
       [1, 142107.34],
       [1, 131876.9],
       [1, 134615.46],
       [1, 130298.13],
       [1, 120542.52],
       [1, 123334.88],
       [1, 101913.08],
       [1, 100671.96],
       [1, 93863.75],
       [1, 91992.39],
       [1, 119943.24],
       [1, 114523.61],
       [1, 78013.11],
       [1, 94657.16],
       [1, 91749.16],
       [1, 86419.7],
       [1, 76253.86],
       [1, 78389.47],
       [1, 73994.56],
       [1, 67532.53],
       [1, 77044.01],
       [1, 64664.71],
       [1, 75328.87],
       [1, 72107.6],
       [1, 66051.52],
       [1, 65605.48],
       [1, 61994.48],
       [1, 61136.38],
       [1, 63408.86],
       [1, 55493.95],
       [1, 46426.07],
       [1, 46014.02],
       [1, 28663.76],
       [1, 44069.95],
       [1, 20229.59],
       [1, 38558.51],
       [1, 28754.33],
       [1, 27892.92],
       [1, 23640.93],
       [1, 15505.73],
       [1, 22177.74],
 

#### Iteración 1

In [10]:
# --> Seleccionar las columnas que se usaran
x_opt = x[:, [0, 1, 2, 3, 4, 5]].tolist()

# --> Crear el modelo
regresion_OLS = sm.OLS(endog=y, exog=x_opt).fit()

# --> Ver los resultados
regresion_OLS.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Wed, 05 Jul 2023",Prob (F-statistic):,1.34e-27
Time:,20:48:15,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


#### Iteración 2

In [11]:
# --> Seleccionar las columnas que se usaran
x_opt = x[:, [0, 1, 3, 4, 5]].tolist()

# --> Crear el modelo
regresion_OLS = sm.OLS(endog=y, exog=x_opt).fit()

# --> Ver los resultados
regresion_OLS.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Wed, 05 Jul 2023",Prob (F-statistic):,8.49e-29
Time:,20:53:36,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


#### Iteracion 3

In [12]:
# --> Seleccionar las columnas que se usaran
x_opt = x[:, [0, 3, 4, 5]].tolist()

# --> Crear el modelo
regresion_OLS = sm.OLS(endog=y, exog=x_opt).fit()

# --> Ver los resultados
regresion_OLS.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Wed, 05 Jul 2023",Prob (F-statistic):,4.53e-30
Time:,20:57:11,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


#### Iteración 4

In [13]:
# --> Seleccionar las columnas que se usaran
x_opt = x[:, [0, 3, 5]].tolist()

# --> Crear el modelo
regresion_OLS = sm.OLS(endog=y, exog=x_opt).fit()

# --> Ver los resultados
regresion_OLS.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Wed, 05 Jul 2023",Prob (F-statistic):,2.1600000000000003e-31
Time:,21:04:15,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


#### Iteración 5

In [14]:
# --> Seleccionar las columnas que se usaran
x_opt = x[:, [0, 3]].tolist()

# --> Crear el modelo
regresion_OLS = sm.OLS(endog=y, exog=x_opt).fit()

# --> Ver los resultados
regresion_OLS.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Wed, 05 Jul 2023",Prob (F-statistic):,3.5000000000000004e-32
Time:,21:04:45,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0
