In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import model_selection
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
import seaborn as sns

In [8]:
import warnings
warnings.simplefilter(action='ignore')

In [5]:
hitters = pd.read_csv('../input/hitters/Hitters.csv')

In [6]:
df = hitters.copy()

In [7]:
df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


> LET`S EXPLORE AND ANALYZE DATA
**We can find the structural information of the dataset by using info.

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AtBat      322 non-null    int64  
 1   Hits       322 non-null    int64  
 2   HmRun      322 non-null    int64  
 3   Runs       322 non-null    int64  
 4   RBI        322 non-null    int64  
 5   Walks      322 non-null    int64  
 6   Years      322 non-null    int64  
 7   CAtBat     322 non-null    int64  
 8   CHits      322 non-null    int64  
 9   CHmRun     322 non-null    int64  
 10  CRuns      322 non-null    int64  
 11  CRBI       322 non-null    int64  
 12  CWalks     322 non-null    int64  
 13  League     322 non-null    object 
 14  Division   322 non-null    object 
 15  PutOuts    322 non-null    int64  
 16  Assists    322 non-null    int64  
 17  Errors     322 non-null    int64  
 18  Salary     263 non-null    float64
 19  NewLeague  322 non-null    object 
dtypes: float64

In [11]:
df.dtypes

AtBat          int64
Hits           int64
HmRun          int64
Runs           int64
RBI            int64
Walks          int64
Years          int64
CAtBat         int64
CHits          int64
CHmRun         int64
CRuns          int64
CRBI           int64
CWalks         int64
League        object
Division      object
PutOuts        int64
Assists        int64
Errors         int64
Salary       float64
NewLeague     object
dtype: object

**At this point Let`s transform Object typed variables into categorical ones.If you look at the chart you will see 'League, Division and Newleague as an Object Columns.Let`s transform together**

In [13]:
df.League = pd.Categorical(df.League)
df.Division = pd.Categorical(df.Division)
df.NewLeague = pd.Categorical(df.NewLeague)

In [14]:
df.dtypes

AtBat           int64
Hits            int64
HmRun           int64
Runs            int64
RBI             int64
Walks           int64
Years           int64
CAtBat          int64
CHits           int64
CHmRun          int64
CRuns           int64
CRBI            int64
CWalks          int64
League       category
Division     category
PutOuts         int64
Assists         int64
Errors          int64
Salary        float64
NewLeague    category
dtype: object

> Let`s look at the knowledge of shape in the dataset

In [16]:
df.shape

(322, 20)

*NOW LET`S SEARCH HOW MANY MISSING OBSERVATIONS ARE THERE IN THE DATASET?*

In [17]:
df.isnull().sum()

AtBat         0
Hits          0
HmRun         0
Runs          0
RBI           0
Walks         0
Years         0
CAtBat        0
CHits         0
CHmRun        0
CRuns         0
CRBI          0
CWalks        0
League        0
Division      0
PutOuts       0
Assists       0
Errors        0
Salary       59
NewLeague     0
dtype: int64

*We just have 59 missing observations in Salary column. As an option I prefer to drop the missing observations.*

In [18]:
df = df.dropna()

In [19]:
df.isnull().sum()

AtBat        0
Hits         0
HmRun        0
Runs         0
RBI          0
Walks        0
Years        0
CAtBat       0
CHits        0
CHmRun       0
CRuns        0
CRBI         0
CWalks       0
League       0
Division     0
PutOuts      0
Assists      0
Errors       0
Salary       0
NewLeague    0
dtype: int64

In [21]:
df.shape

(263, 20)

**Let`s try to seek Descriptive Statistics**

In [22]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AtBat,263.0,403.642586,147.307209,19.0,282.5,413.0,526.0,687.0
Hits,263.0,107.828897,45.125326,1.0,71.5,103.0,141.5,238.0
HmRun,263.0,11.619772,8.757108,0.0,5.0,9.0,18.0,40.0
Runs,263.0,54.745247,25.539816,0.0,33.5,52.0,73.0,130.0
RBI,263.0,51.486692,25.882714,0.0,30.0,47.0,71.0,121.0
Walks,263.0,41.114068,21.718056,0.0,23.0,37.0,57.0,105.0
Years,263.0,7.311787,4.793616,1.0,4.0,6.0,10.0,24.0
CAtBat,263.0,2657.543726,2286.582929,19.0,842.5,1931.0,3890.5,14053.0
CHits,263.0,722.186312,648.199644,4.0,212.0,516.0,1054.0,4256.0
CHmRun,263.0,69.239544,82.197581,0.0,15.0,40.0,92.5,548.0


**REACHING THE CATEGORICAL VARIABLES IN THE HITTERS DATASET**

In [23]:
cat_df = df.select_dtypes(include = ['category'])
cat_df

Unnamed: 0,League,Division,NewLeague
1,N,W,N
2,A,W,A
3,N,E,N
4,N,E,N
5,A,W,A
...,...,...,...
317,N,E,N
318,A,E,A
319,A,W,A
320,A,E,A


**The access to permanent variables in the hitters dataset**

In [24]:
numer = df.select_dtypes(include = ['int64','float64'])
numer

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,475.0
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,480.0
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,500.0
4,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,91.5
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,282,421,25,750.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,379,311,138,325,9,3,700.0
318,492,136,5,76,50,94,12,5511,1511,39,897,451,875,313,381,20,875.0
319,475,126,3,61,43,52,6,1700,433,7,217,93,146,37,113,7,385.0
320,573,144,9,85,60,78,8,3198,857,97,470,420,332,1314,131,12,960.0


**describing of statistics the dataset **

In [26]:
numer.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AtBat,263.0,403.642586,147.307209,19.0,282.5,413.0,526.0,687.0
Hits,263.0,107.828897,45.125326,1.0,71.5,103.0,141.5,238.0
HmRun,263.0,11.619772,8.757108,0.0,5.0,9.0,18.0,40.0
Runs,263.0,54.745247,25.539816,0.0,33.5,52.0,73.0,130.0
RBI,263.0,51.486692,25.882714,0.0,30.0,47.0,71.0,121.0
Walks,263.0,41.114068,21.718056,0.0,23.0,37.0,57.0,105.0
Years,263.0,7.311787,4.793616,1.0,4.0,6.0,10.0,24.0
CAtBat,263.0,2657.543726,2286.582929,19.0,842.5,1931.0,3890.5,14053.0
CHits,263.0,722.186312,648.199644,4.0,212.0,516.0,1054.0,4256.0
CHmRun,263.0,69.239544,82.197581,0.0,15.0,40.0,92.5,548.0


**DATA PREPROCESSING

LET`S SEE ONE HOT ENCODING RESULTS IN OUR CHART
**

In [29]:
cat_df = pd.get_dummies(cat_df, columns = ['League', 'NewLeague', 'Division'], drop_first = True)
cat_df

Unnamed: 0,League_N,NewLeague_N,Division_W
1,1,1,1
2,0,0,1
3,1,1,0
4,1,1,0
5,0,0,1
...,...,...,...
317,1,1,0
318,0,0,0
319,0,0,1
320,0,0,0


> Multivariable Inconsistent Observation Analysis(LOF)

In [34]:
from sklearn.neighbors import LocalOutlierFactor

In [35]:
clf = LocalOutlierFactor(n_neighbors = 20, contamination = 0.1)
clf.fit_predict(numer)
df_scores = clf.negative_outlier_factor_
np.sort(df_scores)[0:30]

array([-6.22769103, -4.12325879, -3.14374811, -2.46483821, -2.37905101,
       -1.93272894, -1.92869115, -1.90888951, -1.67752858, -1.66537127,
       -1.6262689 , -1.48361204, -1.4821636 , -1.4434581 , -1.43603888,
       -1.42239476, -1.40734841, -1.38353108, -1.37252158, -1.35320839,
       -1.33614203, -1.31553401, -1.28358273, -1.24753827, -1.24665446,
       -1.23081072, -1.21977167, -1.21910311, -1.21892488, -1.21707071])

In [36]:
esik_deger = np.sort(df_scores)[5]
print(esik_deger)

-1.9327289436063915


In [39]:
aykiri_tf = df_scores>esik_deger
#baski_deger = numer[df_scores == esik_deger]
aykirilar = numer[~aykiri_tf]

In [40]:
aykirilar

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary
217,20,1,0,0,0,0,2,41,9,2,6,7,4,78,220,6,2127.333
236,237,52,0,15,25,30,24,14053,4256,160,2165,1314,1566,523,43,6,750.0
274,522,140,16,73,77,60,4,730,185,22,93,106,86,1320,166,17,200.0
295,19,4,1,2,3,1,1,19,4,1,2,3,1,692,70,8,920.0
314,408,117,11,66,41,34,1,408,117,11,66,41,34,942,72,11,120.0
315,593,172,22,82,100,57,1,593,172,22,82,100,57,1222,139,15,165.0


In [42]:
numer

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,475.0
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,480.0
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,500.0
4,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,91.5
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,282,421,25,750.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,379,311,138,325,9,3,700.0
318,492,136,5,76,50,94,12,5511,1511,39,897,451,875,313,381,20,875.0
319,475,126,3,61,43,52,6,1700,433,7,217,93,146,37,113,7,385.0
320,573,144,9,85,60,78,8,3198,857,97,470,420,332,1314,131,12,960.0


In [43]:
#aykiri olmayanlar
numer = numer[aykiri_tf]

# Kategorik değişkenlerden, indexi aykırı olanları uçuruyorum
cat_df = cat_df.drop([217,236,274,295,314,315])

In [44]:
numer

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,475.0
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,480.0
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,500.0
4,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,91.5
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,282,421,25,750.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,379,311,138,325,9,3,700.0
318,492,136,5,76,50,94,12,5511,1511,39,897,451,875,313,381,20,875.0
319,475,126,3,61,43,52,6,1700,433,7,217,93,146,37,113,7,385.0
320,573,144,9,85,60,78,8,3198,857,97,470,420,332,1314,131,12,960.0


In [45]:
cat_df

Unnamed: 0,League_N,NewLeague_N,Division_W
1,1,1,1
2,0,0,1
3,1,1,0
4,1,1,0
5,0,0,1
...,...,...,...
317,1,1,0
318,0,0,0
319,0,0,1
320,0,0,0


> THE STANDARDIZATION OF VARIABLES 


In [46]:
y = numer["Salary"]
X = numer.drop('Salary', axis=1)
#scaler = StandardScaler()
#X = scaler.fit_transform(X)

from sklearn import preprocessing
preprocessing.normalize(X)

array([[0.08565732, 0.02202617, 0.0019035 , ..., 0.1718585 , 0.0116929 ,
        0.00271928],
       [0.23703564, 0.06433118, 0.00890739, ..., 0.43547257, 0.04057813,
        0.00692797],
       [0.08262421, 0.02348793, 0.00333162, ..., 0.03331622, 0.00183239,
        0.00049974],
       ...,
       [0.25690321, 0.06814696, 0.00162255, ..., 0.02001141, 0.06111592,
        0.00378594],
       [0.15544157, 0.03906385, 0.00244149, ..., 0.35645763, 0.03553725,
        0.00325532],
       [0.12009821, 0.03235609, 0.00171297, ..., 0.07765463, 0.00076132,
        0.00057099]])

In [48]:
X = pd.concat([X,cat_df], axis=1)
X

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,League_N,NewLeague_N,Division_W
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,1,1,1
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,0,0,1
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,1,1,0
4,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,1,1,0
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,282,421,25,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,379,311,138,325,9,3,1,1,0
318,492,136,5,76,50,94,12,5511,1511,39,897,451,875,313,381,20,0,0,0
319,475,126,3,61,43,52,6,1700,433,7,217,93,146,37,113,7,0,0,1
320,573,144,9,85,60,78,8,3198,857,97,470,420,332,1314,131,12,0,0,0


**THE PHASE OF SETTING A MODEL**

In [49]:
#y = df["Salary"]
#X = df.drop('Salary', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, 
                                                    random_state=46)

> LET`s SEE LINEAR REGRESSION

In [50]:
linreg = LinearRegression()
linreg_model = linreg.fit(X_train,y_train)
linreg_y_pred = linreg_model.predict(X_test)
linreg_rmse = np.sqrt(mean_squared_error(y_test,linreg_y_pred))
linreg_rmse

249.89100620497643

**LET`S SEE RIDGE REGRESSION RESULTS**

In [51]:
ridreg = Ridge()
ridreg_model = ridreg.fit(X_train, y_train)
ridreg_y_pred = ridreg_model.predict(X_test)
ridreg_rmse = np.sqrt(mean_squared_error(y_test,ridreg_y_pred))
ridreg_rmse

249.5448161967652

*LET`S SEE HIPERPARAMETER OPTIMIZATIONS OF THE RIDGE REGRESSION *

In [52]:
alpha = [0.1,0.01,0.001,0.2,0.3,0.5,0.8,0.9,1]
ridreg_cv = RidgeCV(alphas = alpha, scoring = "neg_mean_squared_error", cv = 10, normalize = True)
ridreg_cv.fit(X_train, y_train)
print(ridreg_cv.alpha_)
#Final Model 
ridreg_tuned = Ridge(alpha = ridreg_cv.alpha_).fit(X_train,y_train)
ridreg_tuned_y_pred = ridreg_tuned.predict(X_test)
ridreg_tuned_rmse = np.sqrt(mean_squared_error(y_test,ridreg_tuned_y_pred))
print(ridreg_tuned_rmse)

0.5
249.70628171769428


In [54]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, ElasticNet, Lasso, LassoCV
from sklearn.metrics import mean_squared_error

**Lasso Regresyon**

In [55]:
lasreg = Lasso()
lasreg_model = lasreg.fit(X_train,y_train)
lasreg_y_pred = lasreg_model.predict(X_test)
lasreg_rmse = np.sqrt(mean_squared_error(y_test,lasreg_y_pred))
lasreg_rmse

248.80428167481855

*LET`S SEE HIPERPARAMETER OPTIMIZATIONS OF THE LASSO REGRESSION*

In [56]:
alpha = [0.1,0.01,0.001,0.2,0.3,0.5,0.8,0.9,1]
lasso_cv = LassoCV(alphas = alpha, cv = 10, normalize = True)
lasso_cv.fit(X_train, y_train)
print(lasso_cv.alpha_)
#Final Model 
lasso_tuned = Lasso(alpha = lasso_cv.alpha_).fit(X_train,y_train)
lasso_tuned_y_pred = lasso_tuned.predict(X_test)
lasso_tuned_rmse = np.sqrt(mean_squared_error(y_test,lasso_tuned_y_pred))
print(lasso_tuned_rmse)

1.0
248.80428167481855


**ElasticNet Regresyon**

In [57]:
enet = ElasticNet()
enet_model = enet.fit(X_train,y_train)
enet_y_pred = enet_model.predict(X_test)
enet_rmse = np.sqrt(mean_squared_error(y_test,enet_y_pred))
enet_rmse

247.2374027204521

*LET`S SEE HIPERPARAMETER OPTIMIZATIONS OF THE ELASTICNET REGRESSION*

In [58]:
enet_params = {"l1_ratio": [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
              "alpha":[0.1,0.01,0.001,0.2,0.3,0.5,0.8,0.9,1]}

enet_model = ElasticNet().fit(X_train,y_train)
enet_cv = GridSearchCV(enet_model, enet_params, cv = 10).fit(X, y)
print(enet_cv.best_params_)

#Final Model 
enet_tuned = ElasticNet(**enet_cv.best_params_).fit(X_train,y_train)
enet_tuned_y_pred = enet_tuned.predict(X_test)
enet_tuned_rmse = np.sqrt(mean_squared_error(y_test,enet_tuned_y_pred))
print(enet_tuned_rmse)

{'alpha': 1, 'l1_ratio': 0.7}
247.16065803965932


**Comments
My aim in this study is to set up 4 machine learning models for the Hitters data set and minimize error scores. The works I have done for this purpose are as follows:

1). Hitters Data Set was read.
2). With Exploratory Data Analysis;
Structural information of the dataset was examined.
The types of variables in the dataset were examined.
Converting the object type to categorical type was done.
Size information of the dataset was accessed.
The number of missing observations from which variable in the data set was accessed. Only in our dependent variable "Salary", 59 missing observations were observed and these values ​​were cleared from the data set due to missing observation in the dependent variable.
descriptive statistics of the data set were examined.
The categorical variables in the data set were accessed and the classes of these variables were visualized.
Continuous variables in the dataset were accessed and descriptive statistics were examined.
3). In the Data Pre-Processing section;
Separate operations were performed for continuous variable and categorical variables in the data set. Because after applying the one-hot encoding method to categorical variables, it was not desired to include categorical variables in outlier observation analysis.

One-hot-encoding transformation was applied to categorical variables.
For continuous observations contrary to variables, multivariate observation analysis was performed with LOF method and outliers were removed from the dataset.
indexes removed from continuous variables are also deleted from the categorical variable dataframe.
Variable standardization was applied over the discarded version of the dependent variable (Salary), which does not include the categorical variable.
After variable standardization, before the model was established, standardized continuous variable and categorical variables were combined and the data set was finalized.
4). During the Model Building phase;
One of the methods we learned; By applying Linear, Ridge, Lasso, ElasticNet machine learning models, model errors representing the difference between real values ​​and predicted values ​​were calculated. Later, hyperparameter optimizations were applied for Ridge, Lasso and ElasticNet to further reduce the error value.*