## What is Overfitting
   It means that the model has memorized the training data so well that it performs really badly on the test data; in other words, the model has not generalized enough to handle unseen data.
## How to know that you overfitted
   When the training score is really high and the test score is much lower.
## How to fix it :
* Cross validation
* Early stopping
* Regularization
* Train with more data
* Remove Features
* Ensembling

For more info, check this article: [overfitting-in-machine-learning](https://elitedatascience.com/overfitting-in-machine-learning)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries

In [None]:
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
import time
%matplotlib inline

# Loading Data

In [None]:
# Load the training set and display it as the cell output.
train_csv_path = '../input/older-dataset-for-dont-overfit-ii-challenge/train.csv'
df = pd.read_csv(train_csv_path)
df

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
df.describe()

In [None]:
# Print the frame's column dtypes and non-null counts.
df.info()

In [None]:
# Flag rows that are exact repeats of an earlier row, then count them.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

*No duplicates*

In [None]:
# Build a boolean mask of missing cells and count them per column.
missing_mask = df.isnull()
missing_mask.sum()

*No Missing values*

In [None]:
# Plot how the target values are distributed.
target_values = df['target']
sns.histplot(target_values)

In [None]:
# Print the frame's column dtypes and non-null counts again
# (same call as the earlier df.info() cell).
df.info()

# Splitting and Scaling The Data

In [None]:
# Target vector: the 'target' column cast to integers.
Y = df['target'].astype(int)

# Feature matrix: a new frame holding every column except the label and the row id.
X = df.drop(columns=['target', 'id'])



In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from X, then rescale X with those statistics.
# `minmax` stays fitted so it can be reused on other data later in the notebook.
minmax = MinMaxScaler()
minmax.fit(X)
df1 = minmax.transform(X)

# The scaler returns a plain array; wrap it back into a DataFrame.
X = pd.DataFrame(df1)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
y_valid

# Grid Search

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.neighbors import NearestCentroid
def _continuous_scores(clf, X):
    """Return per-sample scores for ROC AUC, preferring continuous outputs.

    ROC AUC ranks samples, so it needs a continuous score. Hard class labels
    collapse the curve to a single operating point and understate the model.
    Falls back to ``predict`` only when the estimator exposes neither
    ``decision_function`` nor ``predict_proba``.
    """
    if hasattr(clf, "decision_function"):
        return clf.decision_function(X)
    if hasattr(clf, "predict_proba"):
        # Column 1 is the probability of the larger class label (clf.classes_[1]).
        return clf.predict_proba(X)[:, 1]
    return clf.predict(X)


# Baseline comparison: fit every candidate with its default hyper-parameters
# and record its ROC AUC on the held-out validation split.
models = [
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(),
    tree.DecisionTreeClassifier(),
    NearestCentroid(),
    SVC(),
]
model_names = [
    'RandomForestclassifier',
    'LogisticRegression',
    'SGDClassifier',
    'DecisionTreeClassifier',
    'NearestCentroid',
    'SVC',
]
roc = []

for name, clf in zip(model_names, models):
    clf.fit(X_train, y_train)
    print("model_name : ", name)
    print(clf.get_params())
    valid_scores = _continuous_scores(clf, X_valid)
    roc.append(roc_auc_score(y_valid, valid_scores))

# Collected results, consumed by the following cells to build a table and a plot.
d = {'Modelling Algo': model_names, 'Roc': roc}
d

In [None]:
# Tabulate the per-model scores: one column per key of `d`.
roc_frame = pd.DataFrame.from_dict(d)
roc_frame

In [None]:
# Horizontal bar chart of the validation ROC AUC per model.
# `factorplot` was renamed to `catplot`, and its `size` argument to `height`,
# in seaborn 0.9. This notebook already needs a newer seaborn for `histplot`,
# so use the current names.
sns.catplot(y='Modelling Algo', x='Roc', data=roc_frame, kind='bar', height=5, aspect=2)

# Model

In [None]:
from sklearn.metrics import make_scorer


def scoring_roc_auc(y, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y``, or 0.5 when it is undefined.

    ``roc_auc_score`` raises ``ValueError`` when the score cannot be computed,
    for example when ``y`` contains a single class, which can happen in small
    cross-validation folds. In that case the chance-level value 0.5 is returned
    so the search can continue. Any other exception is allowed to propagate
    instead of being silently turned into a score.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predictions or scores to evaluate.

    Returns
    -------
    float
        The ROC AUC, or 0.5 if ``roc_auc_score`` raised ``ValueError``.
    """
    try:
        return roc_auc_score(y, y_pred)
    except ValueError:
        return 0.5


# Scorer object accepted by GridSearchCV's `scoring` argument.
# NOTE(review): with no extra arguments make_scorer feeds the estimator's
# `predict` output (hard labels) to the metric; confirm that is intended.
roc_auc = make_scorer(scoring_roc_auc)

In [None]:

# Hyper-parameters shared by every solver.
shared_grid = {
    'C': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
    'tol': [0.00009, 0.0001, 0.00011],
    'max_iter': [int(x) for x in np.linspace(start=100, stop=10000, num=32)],
}

# Only pair each solver with penalties it actually supports. LogisticRegression
# rejects 'elasticnet' for both 'liblinear' and 'sag' (only 'saga' supports it),
# and 'sag' also rejects 'l1'. A single flat grid therefore spent half of its
# fits on combinations that always fail. Listing the valid sub-grids searches
# exactly the candidates that could ever produce a score.
param_grid = [
    {**shared_grid, 'solver': ['liblinear'], 'penalty': ['l1', 'l2']},
    {**shared_grid, 'solver': ['sag'], 'penalty': ['l2']},
]

# Exhaustive search with 20-fold cross-validation, scored by the custom ROC AUC scorer.
best = GridSearchCV(
    estimator=LogisticRegression(random_state=42, class_weight='balanced'),
    param_grid=param_grid,
    scoring=roc_auc,
    cv=20,
    n_jobs=-1,
    verbose=3,
)
best.fit(X_train, y_train)
print(best.best_score_,best.best_estimator_,best.best_params_)



In [None]:
# Score the tuned model on the held-out validation split.
pred = best.predict(X_valid)
score = roc_auc_score(y_true=y_valid, y_score=pred)
score

# Testing

In [None]:
# Load the test set and display it as the cell output.
test_csv_path = '../input/older-dataset-for-dont-overfit-ii-challenge/test.csv'
test = pd.read_csv(test_csv_path)
test


In [None]:
# Keep the ids for the submission file, then remove them from the features.
test2 = test['id']
test.drop('id', axis=1, inplace=True)

# Reuse the scaler that was already fitted on the training features.
# Calling fit_transform here would re-learn min/max from the test set, so the
# test features would be scaled differently from the data the model was
# trained on.
test_final = minmax.transform(test)
test_final = pd.DataFrame(test_final)
test_final

In [None]:
# Predict the test set with the refitted best estimator from the grid search.
# NOTE(review): this yields hard class labels; if the submission is scored by
# ROC AUC, probabilities (predict_proba) would presumably rank better - confirm.
pred=best.predict(test_final)

In [None]:
# Pair each test id with its predicted target, in submission column order.
submission_columns = {'id': test2, 'target': pred}
predictions = pd.DataFrame(submission_columns)

# 😄 Generating Submission File

In [None]:
# Write the submission to the working directory without the index column.
submission_path = 'submission.csv'
predictions.to_csv(submission_path, index=False)
print("Your submission was successfully saved!")