In [91]:
import numpy as np
import pandas as pd
import plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [92]:
# Load the task data and keep every column from position 1 onward.
# The dropped leading column is presumably a saved row index - verify against the CSV.
data = pd.read_csv('task_b.csv').iloc[:, 1:]

In [93]:
# Preview the first five rows to check which columns remain after loading.
data.head(5)

Unnamed: 0,f1,f2,f3,y
0,-195.871045,-14843.084171,5.53214,1.0
1,-1217.183964,-4068.124621,4.416082,1.0
2,9.138451,4413.412028,0.425317,0.0
3,363.824242,15474.760647,1.094119,0.0
4,-768.812047,-7963.932192,1.870536,0.0


In [94]:
# Correlation of every column with the target 'y' (the 'y' column of the full correlation matrix).
data.corr().loc[:, 'y']

f1    0.067172
f2   -0.017944
f3    0.839060
y     1.000000
Name: y, dtype: float64

In [95]:
# Per-column sample standard deviation (ddof=1 is the pandas default, spelled out here).
data.std(ddof=1)

f1      488.195035
f2    10403.417325
f3        2.926662
y         0.501255
dtype: float64

In [96]:
# Separate the frame into a feature matrix X (f1, f2, f3) and a label vector Y,
# both as plain NumPy arrays, then report their shapes one per line.
X = data.loc[:, ['f1', 'f2', 'f3']].values
Y = data.loc[:, 'y'].values
print(X.shape, Y.shape, sep='\n')

(200, 3)
(200,)


<pre>
Observe how linear models behave when the data has features with very different variances. From the output of the cells above you can see that var(F2) >> var(F1) >> var(F3).

> <b>1. without standardization</b>:
    1. Apply Logistic regression(SGDClassifier with logloss) on 'data' and check the feature importance
    2. Apply SVM(SGDClassifier with hinge) on 'data' and check the feature importance

> <b>2. with standardization</b>:
    1. Apply Logistic regression(SGDClassifier with logloss) on 'data' after standardization 
       i.e standardization(data, column wise): (column-mean(column))/std(column) and check the feature importance
    2. Apply SVM(SGDClassifier with hinge) on 'data' after standardization 
       i.e standardization(data, column wise): (column-mean(column))/std(column) and check the feature importance

</pre>

## 1. Without Standardization

### Logistic Regression (SGDClassifier with logloss) without Standardization

In [97]:
# Logistic regression trained with SGD on the raw (unscaled) features.
# NOTE(review): scikit-learn >= 1.1 names this loss 'log_loss' and 1.3 removed
# the 'log' alias - switch the string when upgrading the environment.
clf_lr = SGDClassifier(loss='log', random_state=42).fit(X, Y)

# Print the learned weight of each feature as a rough importance score.
for feature_no, weight in enumerate(clf_lr.coef_[0], start=1):
    print("F{} : {}".format(feature_no, weight))

F1 : 8252.617126394438
F2 : -9979.999399845494
F3 : 10367.642231334077


### Observation:
- F2, which has high variance and a negative correlation w.r.t. 'y', receives a negative weight, meaning that feature 2 moves predictions towards the negative class.
- The most important feature is F3, as it has the highest value; F1 has the second-highest positive value. Feature importance according to their (signed) value: F3 > F1 > F2
- As the data is not standardized, a negative value doesn't mean that the feature is less important.

### SVM Classifier without Standardization

In [98]:
# Linear SVM (hinge loss) trained with SGD on the raw (unscaled) features.
# max_iter is set to 100000, presumably to give SGD enough epochs on unscaled data.
clf_svm = SGDClassifier(loss='hinge', random_state=42, max_iter=100000)
clf_svm.fit(X, Y)

# Print the learned weight of each feature as a rough importance score.
svm_weights = clf_svm.coef_[0]
for idx in range(len(svm_weights)):
    print("F{} : {}".format(idx + 1, svm_weights[idx]))

F1 : -7107.373899102203
F2 : 9364.07983619022
F3 : 9088.735939707904


### Observation:
- F2 feature has the highest feature importance value in SGDClassifier with hinge loss.
- Feature importance according to their value : F2 > F3 > F1
- F1, which has low variance (relative to F2) and the lowest positive correlation w.r.t. 'y', receives a negative weight, meaning that feature 1 moves predictions towards the negative class.
- F2 has high variance and a negative correlation w.r.t. 'y', but it has a positive feature importance value in the SVM model.
- F3 has low variance and the highest correlation w.r.t. 'y', and it has a high value in both models, so it must be a more important feature than the other two.

## 2. With Standardization

In [99]:
# Learn each column's mean and standard deviation from X, then rescale X
# column-wise to (column - mean) / std.
std_scaler = StandardScaler().fit(X)
x_std = std_scaler.transform(X)

### Logistic Regression with Standardization

In [100]:
# Logistic regression trained with SGD on the standardized features.
# NOTE(review): scikit-learn >= 1.1 names this loss 'log_loss' and 1.3 removed
# the 'log' alias - switch the string when upgrading the environment.
clf_lr_std = SGDClassifier(random_state=42, loss='log')
clf_lr_std.fit(x_std, Y)

# Weights are now on a common scale, so they can be compared across features.
lr_std_weights = clf_lr_std.coef_[0]
for position, coef in enumerate(lr_std_weights, start=1):
    print("F{} : {}".format(position, coef))

F1 : 2.306949148477717
F2 : 4.394031919103951
F3 : 11.532839429320834


### Observation:
- After standardizing the values, F3 is clearly the most important feature, which is consistent with its high correlation w.r.t. 'y' (in the raw data it also had the lowest variance).

### SVM Classifier with Standardization

In [101]:
# Linear SVM (hinge loss) trained with SGD on the standardized features.
clf_svm_std = SGDClassifier(random_state=42, loss='hinge').fit(x_std, Y)

# Weights are now on a common scale, so they can be compared across features.
for feature_no, weight in enumerate(clf_svm_std.coef_[0], start=1):
    print("F{} : {}".format(feature_no, weight))

F1 : -1.9661832545379097
F2 : 2.430602858071307
F3 : 13.547043799043973


### Observation:
- After standardizing the values, the F3 feature has the highest feature importance for the SVM model as well.
- The SVM model assigns F1 a negative weight both with and without standardization.
- Feature importance (by magnitude) for both models after standardization: F3 > F2 > F1