In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

from sklearn.model_selection import RandomizedSearchCV

# dataset

In [2]:
# Load the raw salary CSV from the Colab working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer="/content/Salary_dataset.csv")

# data preprocessing

In [3]:
# Work on an independent (deep) copy so the raw frame in `data` is left untouched.
df = data.copy(deep=True)

In [4]:
# Show the frame's dimensions as a (rows, columns) tuple.
df.shape


(30, 3)

In [5]:
# Preview the first five rows of the freshly copied data.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,YearsExperience,Salary
0,0,1.2,39344.0
1,1,1.4,46206.0
2,2,1.6,37732.0
3,3,2.1,43526.0
4,4,2.3,39892.0


In [6]:
# Print column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       30 non-null     int64  
 1   YearsExperience  30 non-null     float64
 2   Salary           30 non-null     float64
dtypes: float64(2), int64(1)
memory usage: 848.0 bytes


In [7]:
# Remove the "Unnamed: 0" column (an integer column that mirrors the row
# index in the preview above — presumably an index saved into the CSV).
# Reassigning instead of mutating in place, together with errors="ignore",
# keeps this cell idempotent: re-running it no longer raises KeyError once
# the column is already gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [8]:
# Keep years of experience to two decimal places.
df["YearsExperience"] = df["YearsExperience"].round(2)

In [9]:
# Check the first five rows after dropping the extra column and rounding.
df.head(n=5)

Unnamed: 0,YearsExperience,Salary
0,1.2,39344.0
1,1.4,46206.0
2,1.6,37732.0
3,2.1,43526.0
4,2.3,39892.0


In [11]:
# Store salaries as integers (the cast discards any fractional part).
df["Salary"] = df["Salary"].astype(dtype=int)

In [13]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
YearsExperience,0
Salary,0


In [12]:
# Confirm the integer salary column in the first five rows.
df.head(n=5)

Unnamed: 0,YearsExperience,Salary
0,1.2,39344
1,1.4,46206
2,1.6,37732
3,2.1,43526
4,2.3,39892


# feature split

In [14]:
# Target (dependent variable): the salary column as a Series.
y = df["Salary"]
# Predictors (independent variables): every other column, kept as a DataFrame.
x = df.drop(columns=["Salary"])

# train test split

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# model training

In [16]:
# Baseline gradient-boosting regressor with default hyperparameters.
# The seed matches the one used for the train/test split and the randomized
# search, so every stochastic step in the notebook is pinned and a
# Restart & Run All gives the same model.
reg = GradientBoostingRegressor(random_state=42)

In [17]:
# Train the baseline model on the training portion only.
reg.fit(X=x_train, y=y_train)

In [18]:
# Baseline predictions for the held-out rows.
pred = reg.predict(X=x_test)

# model evaluation

In [20]:
# The saved output of this cell shows an "mse:-" line, so the MSE print is
# restored here to keep code and output in agreement. It is computed with
# NumPy (already imported) as the mean of the squared residuals.
print("mse:- ", float(np.mean((np.asarray(y_test) - np.asarray(pred)) ** 2)))
# Coefficient of determination (R^2) for the same predictions.
print("r2_score:- ", r2_score(y_test, pred))

mse:-  99492682.61295848
r2_score:-  0.8052202961617161


# hyperparameter tuning


In [22]:
# Candidate hyperparameter values sampled by the randomized search
# (3 x 3 x 3 = 27 possible combinations).
params = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
)

In [25]:
# Randomized search over `params`: 10 sampled settings, each scored with
# 5-fold cross-validation; the seed fixes which settings are sampled.
random_search = RandomizedSearchCV(
    estimator=reg,
    param_distributions=params,
    n_iter=10,
    cv=5,
    random_state=42,
)

In [26]:
# Run the search on the training data only; the test rows stay unseen.
random_search.fit(X=x_train, y=y_train)

In [27]:
# Show the hyperparameter combination selected by the search.
random_search.best_params_

{'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.01}

In [28]:
# Predictions from the tuned search object for the held-out rows.
# Note: this reuses (and overwrites) the baseline `pred` variable.
pred = random_search.predict(X=x_test)

In [30]:
# R^2 of the tuned model's predictions on the test set.
print("r2_score:- ", r2_score(y_true=y_test, y_pred=pred))

r2_score:-  0.804039494621643
