In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [2]:
df=pd.read_csv('airfoil_self_noise.dat',sep='\t',header=None)

In [3]:
'''Attribute Information:

This problem has the following inputs:
1. Frequency, in Hertzs.
2. Angle of attack, in degrees.
3. Chord length, in meters.
4. Free-stream velocity, in meters per second.
5. Suction side displacement thickness, in meters.

The only output is:
6. Scaled sound pressure level, in decibels.'''

'Attribute Information:\n\nThis problem has the following inputs:\n1. Frequency, in Hertzs.\n2. Angle of attack, in degrees.\n3. Chord length, in meters.\n4. Free-stream velocity, in meters per second.\n5. Suction side displacement thickness, in meters.\n\nThe only output is:\n6. Scaled sound pressure level, in decibels.'

In [4]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


In [5]:
df.columns=["Frequency","Angle of attack","Chord length","Free-stream velocity","Suction side displacement thickness","Scaled sound pressure level"]


In [6]:
df.head(2)

Unnamed: 0,Frequency,Angle of attack,Chord length,Free-stream velocity,Suction side displacement thickness,Scaled sound pressure level
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201


In [7]:
df.dtypes

Frequency                                int64
Angle of attack                        float64
Chord length                           float64
Free-stream velocity                   float64
Suction side displacement thickness    float64
Scaled sound pressure level            float64
dtype: object

In [8]:
df.isnull().sum()

Frequency                              0
Angle of attack                        0
Chord length                           0
Free-stream velocity                   0
Suction side displacement thickness    0
Scaled sound pressure level            0
dtype: int64

In [9]:
X=df.drop('Scaled sound pressure level',axis=1)
Y=df['Scaled sound pressure level']

In [10]:
## do train train split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [None]:
# EDA

In [None]:
plt.figure(figsize=(15,20))
import seaborn as sns; sns.set()
sns.pairplot(df)

In [None]:
# Automated EDA
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title='Pandas Profile Report', explorative=True)
profile.to_file("pandas_profile_report.html")

In [11]:
X_train.corr()

Unnamed: 0,Frequency,Angle of attack,Chord length,Free-stream velocity,Suction side displacement thickness
Frequency,1.0,-0.27941,0.01293,0.137387,-0.233708
Angle of attack,-0.27941,1.0,-0.507163,0.05067,0.760036
Chord length,0.01293,-0.507163,1.0,0.005318,-0.22607
Free-stream velocity,0.137387,0.05067,0.005318,1.0,0.003225
Suction side displacement thickness,-0.233708,0.760036,-0.22607,0.003225,1.0


In [None]:
for feature in df.columns:
    plt.figure(figsize=(10,6))
    sns.boxplot(df[feature])
    plt.title('Box plot of' +str(feature))
    plt.xlabel(feature)
    plt.show()
    plt.savefig('Box plot of' +str(feature)+'.png')



In [None]:
len(df['Scaled sound pressure level'].unique())

In [15]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import  accuracy_score,mean_squared_error,r2_score

# Create and fit the Regression  models
dt = DecisionTreeRegressor()
rf =  RandomForestRegressor()
ada = AdaBoostRegressor()
svr = SVR()
lr = LinearRegression()
ridge = Ridge()
lasso = Lasso()
knn = KNeighborsRegressor()
gb = GradientBoostingRegressor()

models = [dt, rf,ada, svr , lr, ridge,lasso,knn,gb]

for model in models:
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    train_mse = mean_squared_error(y_train, train_pred)
    test_mse = mean_squared_error(y_test, test_pred)
    print(f"Model: {model.__class__.__name__}")
    print(f"Train MSE: {train_mse}")
    print(f"Test MSE: {test_mse}")
    print(f"R square train is:{r2_score(y_train, train_pred)}")
    print(f"R square train is:{r2_score(y_test, test_pred)}")
    print()


Model: DecisionTreeRegressor
Train MSE: 0.0
Test MSE: 5.583210445182721
R square train is:1.0
R square train is:0.888555510546345

Model: RandomForestRegressor
Train MSE: 0.44078814053461596
Test MSE: 3.3163891305003395
R square train is:0.9906046667240029
R square train is:0.9338027292528169

Model: AdaBoostRegressor
Train MSE: 14.347284712807978
Test MSE: 15.969326626786893
R square train is:0.6941897726219255
R square train is:0.6812419180121589

Model: SVR
Train MSE: 37.290193312478884
Test MSE: 46.28588562439207
R square train is:0.20516510795375154
R square train is:0.07610380390169424

Model: LinearRegression
Train MSE: 23.296080124160657
Test MSE: 22.128643318247335
R square train is:0.503447537119858
R square train is:0.5582979754897274

Model: Ridge
Train MSE: 24.500317205016636
Test MSE: 24.671013872218413
R square train is:0.47777940388870255
R square train is:0.5075506158529834

Model: Lasso
Train MSE: 34.05631747642758
Test MSE: 37.83893051168915
R square train is:0.27409

In [20]:
from sklearn.model_selection import cross_val_score
for model in models:
    mse=cross_val_score(model,X,Y,scoring='neg_mean_squared_error',cv=5)
    print(f"Model: {model.__class__.__name__}")
    print("mse is ",np.mean(mse))
    print('\n')
    

Model: DecisionTreeRegressor
mse is  -19.1208846389856


Model: RandomForestRegressor
mse is  -13.171585171418425


Model: AdaBoostRegressor
mse is  -22.797355051901178


Model: SVR
mse is  -44.292987062685484


Model: LinearRegression
mse is  -27.188623343053386


Model: Ridge
mse is  -28.408990019900944


Model: Lasso
mse is  -44.73682942704856


Model: KNeighborsRegressor
mse is  -43.23888622642578


Model: GradientBoostingRegressor
mse is  -14.2347860923723




In [21]:
# Best model is Ranfom foreast:- 
# gridsearch cv for increading acuracy 
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators': [50,100,150],
              'criterion':['squared_error','absolute_error','friedman_mse'],
              'max_depth':[5,7,9,10,12],
              'max_features':['sqrt','log2']
              }
rf_grid  = GridSearchCV(rf,param_grid=parameters,cv=5)
rf_grid.fit(X,Y)
print(rf_grid.best_score_)
print(rf_grid.best_params_)

0.7137430034385138
{'criterion': 'friedman_mse', 'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 150}


In [22]:
import pickle

with open('model.pkl', 'wb') as file:
    pickle.dump(rf, file)

In [None]:
pickled_model = pickle.load(open('model.pkl', 'rb'))
pickled_model.predict(X_test)