In [1]:
from pprint import pprint
import pandas as pd
from sklearn.datasets import load_iris
iris_dataset = load_iris()

In [2]:
iris_dataset.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [3]:
iris_dataset.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [4]:
iris = pd.DataFrame(data = iris_dataset.data,columns = iris_dataset.feature_names)
iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
sepal length (cm)    150 non-null float64
sepal width (cm)     150 non-null float64
petal length (cm)    150 non-null float64
petal width (cm)     150 non-null float64
dtypes: float64(4)
memory usage: 4.8 KB


In [6]:
# Strip non-numerics
iris = iris.select_dtypes(include=['int', 'float'])

headers = list(iris.columns.values)
fields = []

for field in headers:
    fields.append({
        'name' : field,
        'mean': iris[field].mean(),
        'var': iris[field].var(),
        'sdev': iris[field].std()
    })

for field in fields:
    pprint(field)

{'mean': 5.843333333333335,
 'name': 'sepal length (cm)',
 'sdev': 0.8280661279778629,
 'var': 0.6856935123042505}
{'mean': 3.057333333333334,
 'name': 'sepal width (cm)',
 'sdev': 0.435866284936698,
 'var': 0.1899794183445188}
{'mean': 3.7580000000000027,
 'name': 'petal length (cm)',
 'sdev': 1.7652982332594667,
 'var': 3.1162778523489942}
{'mean': 1.199333333333334,
 'name': 'petal width (cm)',
 'sdev': 0.7622376689603465,
 'var': 0.5810062639821029}


In [7]:
pd.set_option('display.max_columns', 0)
pd.set_option('display.max_rows', 0)
df2 = pd.DataFrame(fields)
display(df2)

Unnamed: 0,mean,name,sdev,var
0,5.843333,sepal length (cm),0.828066,0.685694
1,3.057333,sepal width (cm),0.435866,0.189979
2,3.758,petal length (cm),1.765298,3.116278
3,1.199333,petal width (cm),0.762238,0.581006


### Outliers are values that are unusually high or low. Sometimes outliers are simply errors; 
### this is a result of observation error. Outliers can also be truly large or small values that may be difficult to address. 
### We typically consider outliers to be a value that is several standard deviations from the mean.
### The following function can remove such values.

In [8]:
df = iris
# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean())
                          >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)

In [9]:
# Drop outliers in iris-dataset
#where the sepal width is more than two standard deviations above or below the mean.
import numpy as np
print("Length before sepal width outliers dropped: {}".format(len(df)))
remove_outliers(df,'sepal width (cm)',2)
print("Length after sepal width outliers dropped: {}".format(len(df)))

Length before sepal width outliers dropped: 150
Length after sepal width outliers dropped: 145


# Dropping Fields

In [10]:
# Some fields are of no value to the neural network should be dropped.
# The following code removes the name column from the MPG dataset.
# df.drop('name', 1, inplace=True)

# Training and Validation

In [11]:
# # Usually a good idea to shuffle
# df = df.reindex(np.random.permutation(df.index)) 
# mask = np.random.rand(len(df)) < 0.8
# trainDF = pd.DataFrame(df[mask])
# # validationDF = pd.DataFrame(df[~mask])
# print(f"Training DF: {len(trainDF)}")
# print(f"Validation DF: {len(validationDF)}")

In [12]:
# import os
# import pandas as pd
# import numpy as np

# path = "."  # put path of your directory 

# df = pd.read_csv(
#     "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
#     na_values=['NA','?'])

# filename_write = os.path.join(path, "auto-mpg-shuffle.csv")
# df = df.reindex(np.random.permutation(df.index))
# # Specify index = false to not write row numbers
# df.to_csv(filename_write, index=False) 
# print("Done")

### Pickle will restore your index (helpful in stock data)

# Encoding Continuous Value

### Always normalize(Get Z score which tells us how far(std dev) we are from our mean value.It fixes range 

In [13]:
import os
import pandas as pd
from scipy.stats import zscore

df1= pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
    na_values=['NA','?'])

pd.set_option('display.max_columns', 7)
pd.set_option('display.max_rows', 5)

df1['mpg'] = zscore(df1['mpg'])
display(df1)

Unnamed: 0,mpg,cylinders,displacement,...,year,origin,name
0,-0.706439,8,307.0,...,70,1,chevrolet chevelle malibu
1,-1.090751,8,350.0,...,70,1,buick skylark 320
...,...,...,...,...,...,...,...
396,0.574601,4,120.0,...,82,1,ford ranger
397,0.958913,4,119.0,...,82,1,chevy s-10


# Encoding Continuous Value

In [14]:
import pandas as pd

df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA','?'])

pd.set_option('display.max_columns', 0)
pd.set_option('display.max_rows', 0)

display(df)

Unnamed: 0,id,job,area,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,product
0,1,vv,c,50876.0,13.100000,1,9.017895,35,11.738935,49,0.885827,0.492126,0.071100,b
1,2,kd,c,60369.0,18.625000,2,7.766643,59,6.805396,51,0.874016,0.342520,0.400809,c
2,3,pe,c,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,b
3,4,11,c,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,b
4,5,kl,d,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,a
5,6,e2,c,70854.0,40.400000,1,14.893343,87,20.340593,43,0.866142,0.673228,0.473581,d
6,7,kl,d,38726.0,30.975000,3,3.822477,33,9.480399,39,0.976378,0.874016,0.092151,f
7,8,nb,a,55162.0,26.966667,2,4.312097,17,29.219896,44,1.000000,0.724409,0.162833,b
8,9,al,c,67311.0,32.383333,0,25.093772,169,10.927357,45,0.952756,0.681102,0.096333,c
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [15]:
areas = list(df['area'].unique())
areas

['c', 'd', 'a', 'b']

In [16]:
len(areas)

4

In [17]:
dummies = pd.get_dummies(df['area'],prefix='area')
dummies

Unnamed: 0,area_a,area_b,area_c,area_d
0,0,0,1,0
1,0,0,1,0
2,0,0,1,0
3,0,0,1,0
4,0,0,0,1
5,0,0,1,0
6,0,0,0,1
7,1,0,0,0
8,0,0,1,0
...,...,...,...,...


In [18]:
df = pd.concat([df,dummies],axis=1)
df.drop('area', axis=1, inplace=True)
df

Unnamed: 0,id,job,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,product,area_a,area_b,area_c,area_d
0,1,vv,50876.0,13.100000,1,9.017895,35,11.738935,49,0.885827,0.492126,0.071100,b,0,0,1,0
1,2,kd,60369.0,18.625000,2,7.766643,59,6.805396,51,0.874016,0.342520,0.400809,c,0,0,1,0
2,3,pe,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,b,0,0,1,0
3,4,11,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,b,0,0,1,0
4,5,kl,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,a,0,0,0,1
5,6,e2,70854.0,40.400000,1,14.893343,87,20.340593,43,0.866142,0.673228,0.473581,d,0,0,1,0
6,7,kl,38726.0,30.975000,3,3.822477,33,9.480399,39,0.976378,0.874016,0.092151,f,0,0,0,1
7,8,nb,55162.0,26.966667,2,4.312097,17,29.219896,44,1.000000,0.724409,0.162833,b,1,0,0,0
8,9,al,67311.0,32.383333,0,25.093772,169,10.927357,45,0.952756,0.681102,0.096333,c,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


# Target Encoding for Categoricals
###### Target encoding can sometimes increase the predictive power of a machine learning model. However, it also dramatically increases the risk of overfitting. Because of this risk, you must take care if you are using this method.   

# Target encoding is a popular technique for Kaggle competitions.

###### Generally, target encoding can only be used on a categorical feature when the output of the machine learning model is numeric (regression).

###### The concept of target encoding is straightforward. For each category, we calculate the average target value for that category. Then to encode, we substitute the percent that corresponds to the category that the categorical value has. Unlike dummy variables, where you have a column for each category, with target encoding, the program only needs a single column.
###### In this way, target coding is more efficient than dummy variables

In [19]:
def process_string(str):
    t = str.strip()
    return t[0].upper()+t[1:]
l = ['   apple  ', 'pear ', 'orange', 'pine apple  ']
list(map(process_string, l))

['Apple', 'Pear', 'Orange', 'Pine apple']

In [20]:
list1 = ['   apple  ', 'pear ', 'orange', 'pine apple  ']
l2 = [process_string(x) for x in list1]
print(l2)

['Apple', 'Pear', 'Orange', 'Pine apple']


In [21]:

def greater_than_five(x):
    return x>5

l = [ 1, 10, 20, 3, -2, 0]
l2 = list(filter(greater_than_five, l))
print(l2)

[10, 20]


In [22]:
l = [ 1, 10, 20, 3, -2, 0]
l2 = list(filter(lambda x: x>5, l))
print(l2)

[10, 20]


In [23]:
np.random.seed(43)
dog_tiger_data= pd.DataFrame({
    'cont_9': np.random.rand(10)*100,
    'cat_0': ['dog'] * 5 + ['cat'] * 5,
    'cat_1': ['wolf'] * 9 + ['tiger'] * 1,
    'target': [1, 0, 1, 1, 1, 1, 0, 0, 0, 0]
})

pd.set_option('display.max_columns', 0)
pd.set_option('display.max_rows', 0)
display(dog_tiger_data)

Unnamed: 0,cont_9,cat_0,cat_1,target
0,11.505457,dog,wolf,1
1,60.906654,dog,wolf,0
2,13.339096,dog,wolf,1
3,24.058962,dog,wolf,1
4,32.713906,dog,wolf,1
5,85.913749,cat,wolf,1
6,66.609021,cat,wolf,0
7,54.116221,cat,wolf,0
8,2.901382,cat,wolf,0
9,73.37483,cat,tiger,0


In [24]:
# Here we can see that if we apply direct mean value for each of the category 
# we will get error for cat_1 becuz tiger is 1 and will give us wrong results.
dog_tiger_data['target'].to_list()

[1, 0, 1, 1, 1, 1, 0, 0, 0, 0]

In [25]:
means0 = dog_tiger_data.groupby('cat_0')['target'].mean()
means0

cat_0
cat    0.2
dog    0.8
Name: target, dtype: float64

In [26]:
mea_n = dog_tiger_data.groupby('cat_0')['target'].mean().to_dict()
mea_n

{'cat': 0.2, 'dog': 0.8}

# Smoothening the mean for target encoding 

In [27]:
def calc_smooth_mean(df1, df2, cat_name, weight, target):
    # Compute the global mean
    mean = df[ target ].mean()

    # Compute the number of values and the mean of each group
    agg = df.groupby(cat_name)[ target ].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']

    # Compute the "smoothed" means
    smooth = (counts * means + weight * mean) / (counts + weight)

    # Replace each value by the according smoothed mean
    if df2 is None:
        return df1[cat_name].map(smooth)
    else:
        return df1[cat_name].map(smooth),df2[cat_name].map(smooth.to_dict())

In [28]:
WEIGHT = 5
df['cat_0_enc'] = calc_smooth_mean(df1=dog_tiger_data, df2=None, 
    cat_name='cat_0',target='target' ,weight=WEIGHT)
df['cat_1_enc'] = calc_smooth_mean(df1=dog_tiger_data, df2=None, 
    cat_name='cat_1',target='target' ,weight=WEIGHT)

pd.set_option('display.max_columns', 0)
pd.set_option('display.max_rows', 0)

display(dog_tiger_data)

KeyError: 'target'

# Shuffling

In [None]:
# for same index
# np.random.seed(42) # Uncomment this line to get the same shuffle each time
df = df.reindex(np.random.permutation(df.index))

In [None]:
# Resetting the index values
df.reset_index(inplace=True, drop=True)

# Sorting

In [None]:
df = df.sort_values(by='subscriptions', ascending= True)

In [None]:
df.head()

In [None]:
d = df.groupby('subscriptions')['age'].count().to_dict()
d

In [None]:
d[0]

# Apply And MAP

In [None]:
df1.head()

In [None]:
# Apply the map
df1['origin_name'] = df1['origin'].map({1: 'North America', 2: 'Europe', 3: 'Asia'}) 

# Shuffle the data, so that we hopefully see
# more regions.
df1 = df1.reindex(np.random.permutation(df1.index)) 

# Efficieny using apply 
efficiency = df1.apply(lambda x:x['displacement']/x['horsepower'],axis = 1)
df1['efficiency'] = efficiency

# Display
pd.set_option('display.max_columns', 0)
pd.set_option('display.max_rows', 10)
display(df1)

In [None]:
# import pandas as pd
# Agri_data =pd.read_csv('https://www.irs.gov/pub/irs-soi/16zpallagi.csv')
# Agri_data=Agri_data.loc[(Agri_data['zipcode']!=0) & (Agri_data['zipcode']!=99999),
#           ['STATE','zipcode','agi_stub','N1']]
# Agri_data.head()

In [None]:
medians = {1:12500,2:37500,3:62500,4:87500,5:112500,6:212500}
Agri_data['agi_stub']=df.agi_stub.map(medians)

groups = df.groupby(by='zipcode')
# Apply on groups
Agri_data= pd.DataFrame(groups.apply(    
    lambda x:sum(x['N1']*x['agi_stub'])/sum(x['N1']))) \
    .reset_index()

In [None]:
#how to insert a new column in dataframe using values of other columns

In [None]:
df.insert(1, 'weight_kg', (df['weight'] * 0.45359237).astype(int))

# Great Circle Distance

# Train_test_split for time-series data

In [None]:
Are they any special considerations when creating train/test splits for time series? If so, what and why?
    
Since our model is meant to predict events in the future, we must also validate the model on events in the future.
If the data is mixed up between the training and test sets,then future data will leak in to the model and our validation results will overestimate the performance on new data.

# Feature Engineering

In [None]:
# whenn to check unique elements in a column

In [None]:
# How to remove missing observations

In [None]:
start_id = max(df[df['obs_num'] == 0].index.tolist())+1  # Find the last zero and move one beyond
print(start_id)
df = df[start_id:] # Trim the rows that have missing observations

In [None]:
dff = data_1.copy()
pd.unique(dff.targetcolumn)
dff.groupby('targetcolumn')['anothercolumn'].count()
# Drop live(one of the value in state column) projects(here target column is 'state')
dff = dff.query('state != "live"')

# Add outcome column, "successful" == 1, others are 0
dff= dff.assign(outcome=(dff['state'] == 'successful').astype(int))

#Converting timestamps
# I convert the launched feature(contains dates which are parsed from strings) into categorical features we can use in a model. 
# Since I loaded in the columns as timestamp data, 
# I access date and time values through the .dt attribute on the timestamp column.
dff['day'] = dff['column name containing timestamp as 2017-11-06 15:13:23'].dt.day.astype('uint8')
dff = dff.assign(hour=dff.launched.dt.hour.astype('uint8'),
               day=dff.launched.dt.day,
               month=dff.launched.dt.month,
               year=dff.launched.dt.year)

#Prepping categorical variables
Now for the categorical variables -- I'll need to convert them into integers so our model can use the data. 
For this I'll use scikit-learn's LabelEncoder. 
This assigns an integer to each value of the categorical feature and replaces those values with the integers

from sklearn.preprocessing import LabelEncoder

cat_features = [col for col in dff.columns ifdff[col].dtype == "object"]
encoder = LabelEncoder()

# from sklearn.preprocessing import LabelEncoder
# cat_features = [col for col in dff.columns ifdff[col].dtype == "object"]
# encoder = preprocessing.LabelEncoder()

# # Create new columns in clicks using preprocessing.LabelEncoder()

# for feature in cat_features:
#     dff[feature + '_labels'] = encoder.fit_transform(clicks[feature])

# Apply the label encoder to each column
encoded =dff[cat_features].apply(encoder.fit_transform)
encoded.head(10)

I'll collect all the features we'll use in a new dataframe and use that to train a model.
data = ks[['goal', 'hour', 'day', 'month', 'year', 'outcome']].join(encoded)


# For Training(80%),validation(10%) and testing(10%) of a  model
valid_fraction = 0.1
valid_size = int(len(data) * valid_fraction)

train = data[:-2 * valid_size]
valid = data[-2 * valid_size:-valid_size]
test = data[-valid_size:]
for each in [train, valid, test]:
    print(f"Outcome fraction = {each.outcome.mean():.4f}")
    
# A good way to do this automatically is with sklearn.model_selection.StratifiedShuffleSplit
#-------------------Training LightGBM Model-------------------------#

import lightgbm as lgb

feature_cols = train.columns.drop('outcome')

dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

param = {'num_leaves': 64, 'objective': 'binary'}
param['metric'] = 'auc'
num_round = 1000
bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid], early_stopping_rounds=10, verbose_eval=False)
from sklearn import metrics
ypred = bst.predict(test[feature_cols])
score = metrics.roc_auc_score(test['outcome'], ypred)

print(f"Test AUC score: {score}")

# Count Encoding for categorical values

In [None]:
Count encoding replaces each categorical value with the number of times it appears in the dataset. 
For example, if the value "GB" occured 10 times in the country feature, then each "GB" would be replaced with the number 10.

In [None]:
# Should write a function such as for splitting so that we will be able to repeat the process again
def get_data_splits(dataframe, valid_fraction=0.1):
    valid_fraction = 0.1
    valid_size = int(len(dataframe) * valid_fraction)

    train = dataframe[:-valid_size * 2]
    # valid size == test size, last two sections of the data
    valid = dataframe[-valid_size * 2:-valid_size]
    test = dataframe[-valid_size:]
    
    return train, valid, test

In [None]:
import category_encoders as ce

cat_features = ['ip', 'app', 'device', 'os', 'channel']
train, valid, test = get_data_splits(clicks)
 # Create the count encoder
count_enc = ce.CountEncoder(cols=cat_features)
# Learn encoding from the training set
count_enc.fit(train[cat_features])

    # Apply encoding to the train and validation sets
train_encoded = train.join(count_enc.transform(train[cat_features]).add_suffix('_count'))
valid_encoded = valid.join(count_enc.transform(valid[cat_features]).add_suffix('_count'))

# Target Encoding

In [None]:
Target encoding replaces a categorical value with the average value of the target for that value of the feature.
For example, given the country value "CA", you'd calculate the average outcome for all the rows with country == 'CA', around 0.28. 
This is often blended with the target probability over the entire dataset to reduce the variance of values with few occurences.
# Try to check unique levels of category in a column
This technique uses the targets to create new features. So including the validation or test data in the target encodings would be a form of target leakage.
Instead, you should learn the target encodings from the training dataset only and apply it to the other datasets.

In [None]:
import category_encoders as ce
cat_features = ['category', 'currency', 'country']

# Create the encoder itself
target_enc = ce.TargetEncoder(cols=cat_features)

train, valid, _ = get_data_splits(data)

# Fit the encoder using the categorical features and target
target_enc.fit(train[cat_features], train['outcome'])

# Transform the features, rename the columns with _target suffix, and join to dataframe
train = train.join(target_enc.transform(train[cat_features]).add_suffix('_target'))
valid = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_target'))

# CatBoost Encoding

In [None]:
Similiar to target Encoding but....
with CatBoost, for each row, the target probability is calculated only from the rows before it.
better than target Encoding

In [None]:
# Replace this only

In [None]:
#target_enc = ce.CatBoostEncoder(cols=cat_features)

In [None]:
encoded = cb_enc.transform(clicks[cat_features])
for col in encoded:
    clicks.insert(len(clicks.columns), col + '_cb', encoded[col])

# Feature Generation

#### Method-1 Interaction

In [None]:
Build interaction features from all pairs of categorical features.
example, if one record has the country "CA" and category "Music", you can create a new value "CA_Music".
Then, label encode the interaction feature and add it to our data.

import itertools
from sklearn.preprocessing import LabelEncoder
interactions = pd.DataFrame(index=dff.index)
for col1, col2 in itertools.combinations(cat_features, 2):
    new_col_name = '_'.join([col1, col2])
    # Convert to strings and combine
    new_values = dff[col1].map(str) + "_" + dff[col2].map(str)
    label_enc = LabelEncoder()
    interactions[new_col_name] = label_enc.fit_transform(new_values)
data = data.join(interactions)

In [None]:
# applying to count no of projects in last week

In [None]:
I'll create the series, using ks.launched as the index and ks.index as the values, then sort the times. 
Using a time series as the index allows us to define the rolling window size in terms of hours, days, weeks, etc.

# First, create a Series with a timestamp index
launched = pd.Series(ks.index, index=ks.launched, name="count_7_days").sort_index()
count_7_days = launched.rolling('7d').count() - 1
# subtracting 1 to remove cuuent project

we need to adjust the index so we can join it with the other training data.
count_7_days.index = launched.values
count_7_days = count_7_days.reindex(ks.index)
data.join(count_7_days)

In [None]:
Time since the last project in the same category
Do projects in the same category compete for donors?
If you're trying to fund a video game and another game project was just launched, you might not get as much money. 
We can capture this by calculating the time since the last launch project in the same category.

In [None]:
def time_since_last_project(series):
    # Return the time in hours
    return series.diff().dt.total_seconds() / 3600.

df = ks[['category', 'launched']].sort_values('launched')
timedeltas = df.groupby('category').transform(time_since_last_project)

# Final time since last project
timedeltas = timedeltas.fillna(timedeltas.median()).reindex(baseline_data.index)
timedeltas.head(20)

data = data.join(timedeltas.rename({'launched': 'time_since_last_project'}, axis=1))

In [None]:
# can change numerical column using np.sqrt or np.log(colname)
# visualize this in histogram

# Count no of times target value = say  8 before current date

In [None]:
 def previous_attributions(series):
        # Subtracting raw values so I don't count the current event
        sums = series.expanding(min_periods=2).sum() - series
        return sums

### These all techniques we have applied for LightGBM model which is a tree network 
### Let's select and understand "Feature selection" for neural networks

# Univariate Feature Selection

In [None]:
From the scikit-learn feature selection module, feature_selection.
SelectKBest returns the K best features given some scoring function. 
For our classification problem, the module provides three different scoring functions:  
    χ2  
    ANOVA F-value:- The F-value measures the linear dependency between the feature variable and the target. This means the score might underestimate the relation between a feature and the target if the relationship is nonlinear. 
    mutual information score :- nonparametric and so can capture nonlinear relationships.

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

feature_cols = data.columns.drop('outcome')
train, valid, _ = get_data_splits(baseline_data)
# Keep 5 features
selector = SelectKBest(f_classif, k=5)

X_new = selector.fit_transform( train[feature_cols], train['outcome'])
X_new 

#we get back an array with only the selected features "for training set"

# we need to find the columns which are dropped so that we can remove it from validation set
we can use ".inverse_transform" to get back an array with the shape of the original data.
# Get back the features we've kept, zero out all other features
selected_features = pd.DataFrame(selector.inverse_transform(X_new), 
                                 index=train.index, 
                                 columns=feature_cols)
selected_features.head()

We can find the selected columns by choosing features where the variance is non-zero.

# Dropped columns have values of all 0s, so var is 0, drop them
selected_columns = selected_features.columns[selected_features.var() != 0]
_ = train_model(train.drop(dropped_columns, axis=1), 
                valid.drop(dropped_columns, axis=1),
                test.drop(dropped_columns, axis=1))

# Get the valid dataset with the selected features.
# valid[selected_columns].head()

In [None]:
Univariate methods consider only one feature at a time when making a selection decision.therefore Use Lasso/Ridge regularization

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

train, valid, _ = get_data_splits(baseline_data)

X, y = train[train.columns.drop("outcome")], train['outcome']

# Set the regularization parameter C=1
def select_features_l1(X, y):
        logistic = LogisticRegression(C=0.1, penalty="l1", random_state=7).fit(X, y)
        model = SelectFromModel(logistic, prefit=True)

        X_new = model.transform(X)

        # Get back the kept features as a DataFrame with dropped columns as all 0s
        selected_features = pd.DataFrame(model.inverse_transform(X_new), 
                                        index=X.index,
                                        columns=X.columns)

        # Dropped columns have values of all 0s, keep other columns 
        cols_to_keep = selected_features.columns[selected_features.var() != 0]

        return cols_to_keep
# Same as above

# Label_Encoding it may happen that after train_test_split test and train has different unique values of classification(even after seeing unique values)Then we need to either drop or write custom encoder

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

# Finding all categorical columns

In [None]:
# All categorical columns
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely label encoded
good_label_cols = [col for col in object_cols if 
                   set(X_train[col]) == set(X_valid[col])]
        
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))
        
print('Categorical columns that will be label encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

In [None]:

# Get list of categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

In [None]:
from sklearn.preprocessing import LabelEncoder


# Make copy to avoid changing original data 
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply label encoder to each column with categorical data
label_encoder = LabelEncoder()
for col in object_cols:# Change it to set(good_label_cols)
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])

print("MAE from Approach 2 (Label Encoding):") 
# print(score_dataset(label_X_train, label_X_valid, y_train, y_valid)) 

In [None]:
The output above shows, for each column with categorical data, the number of unique values in the column. 
For instance, the 'Street' column in the training data has two unique values: 'Grvl' and 'Pave', 
    corresponding to a gravel road and a paved road, respectively.

We refer to the number of unique entries of a categorical variable as the cardinality of that categorical variable. 
For instance, the 'Street' variable has cardinality 2.

# Cardinality

In [None]:
# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))

# Print number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])

For large datasets with many rows, one-hot encoding can greatly expand the size of the dataset. 
For this reason, we typically will only one-hot encode columns with relatively low cardinality. 
Then, high cardinality columns can either be dropped from the dataset, or we can use label encoding.

consider a dataset with 10000 rows, and containing one categorical column with 100 unique entries.
# How many entries are added to the dataset by replacing the column with a one-hot encoding?
OH_entries_added = 990000(no of rows *(no of unique entries in col_c1* col_c1+so_on...)-number of entries in the original column.



In [None]:
# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]

# Columns that will be dropped from the dataset
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))

print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))# only here changes happen to low cardinality

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)


# Pipeline

In [None]:
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)
# "Cardinality" means the number of unique values in a column

# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 10 and 
                    X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_cols),('cat', categorical_transformer, categorical_cols)
    ])

# Defining the model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, criterion="mse",random_state=0)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)


# CrossValidation See  more on kaggle

In [None]:
from sklearn.model_selection import cross_val_score

# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(my_pipeline, X, y,
                              cv=5,
                              scoring='neg_mean_absolute_error')

print("MAE scores:\n", scores)
print("Average MAE score (across experiments):")
print(scores.mean())

In [None]:
def get_score(n_estimators):
    """Return the average MAE over 3 CV folds of random forest model.
    
    Keyword argument:
    n_estimators -- the number of trees in the forest
    """
    my_pipeline = Pipeline(steps=[('preprocessor', SimpleImputer()),('model', RandomForestRegressor(n_estimators, random_state=0))])
    scores = -1 * cross_val_score(my_pipeline, X, y,cv= 3,scoring='neg_mean_absolute_error')
    return scores.mean()
    pass
List = [50,100,150,200,250,300,350,400]
results = dict(zip(List, (get_score(x) for x in List)))

plt.plot(list(results.keys()), list(results.values()))
plt.show()

# Gradient Boosting/Ensembling

In [None]:
# One-hot encode the data (to shorten the code, we use pandas)
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)
X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
X_train, X_test = X_train.align(X_test, join='left', axis=1)

# XGBoost Model

In [None]:
# Define the model
from xgboost import XGBRegressor
my_model_2 = XGBRegressor(n_estimators = 500 , learning_rate = 0.01,n_jobs =4)

# Fit the model
my_model_2.fit(X_train, y_train, 
             early_stopping_rounds=5, 
             eval_set=[(X_valid, y_valid)], 
             verbose=False)

# Get predictions
predictions_2 = my_model_2.predict(X_valid)

# Calculate MAE
mae_2 = mean_absolute_error(predictions_2 , y_valid)

# Uncomment to print MAE
print("Mean Absolute Error:" , mae_2)

In [None]:
In Xgboost we have XGB Classifier() which has lots of params which one to choose is difficult choice 
which is solved by randomsearch parameter which will go in permutation and combination of each value of this learning rate  

In [None]:
classifier = xgboost.classifier()
params = {
            learning_rate: [0.01 , 0.10, 0.15 , 0.20 , 0.25 ]
            n_estimators: 100 #f the size of your data is high, 1000 is if it is medium-low
            max_depth: [3, 4 , 5 , 6 , 9 , 10 , 12 ]
            subsample: [0.8,0.9...]
            colsample_bytree: [ 0.1 , 0.5 , 1.0 ,  0.7 ]
            gamma: [ 0 , 1 , 5 ]
         }

# Data Leakage

##### In other words, leakage causes a model to look accurate until you start making decisions with the model, and then the model becomes very inaccurate.

##### There are two main types of leakage: target leakage and train-test contamination.

In [None]:
# For target leakage- Model gives high scores at the time of validation but will give inaccurate results in real-time data. 
To prevent this type of data leakage, 
any variable updated (or created) after the target value is realized should be excluded.

In [None]:
# Train-test contamination -
 Occurs when you aren't careful to distinguish training data from validation data.
    Your model may get good validation scores, giving you great confidence in it, 
    but perform poorly when you deploy it to make decisions.

In [None]:
import pandas as pd

# Read the data
data = pd.read_csv('../input/aer-credit-card-data/AER_credit_card_data.csv', 
                   true_values = ['yes'], false_values = ['no'])

# Select target
y = data.card

# Select predictors
X = data.drop(['card'], axis=1)
# Check some data comparisons on some columns which are not clear

# card: 1 if credit card application accepted, 0 if not
# reports: Number of major derogatory reports
# age: Age n years plus twelfths of a year
# income: Yearly income (divided by 10,000)
# share: Ratio of monthly credit card expenditure to yearly income
# expenditure: Average monthly credit card expenditure
# owner: 1 if owns home, 0 if rents
# selfempl: 1 if self-employed, 0 if not
# dependents: 1 + number of dependents
# months: Months living at current address
# majorcards: Number of major credit cards held
# active: Number of active credit accounts
# A few variables look suspicious. For example, does expenditure mean expenditure on this card or on cards used before appying?


expenditures_cardholders = X.expenditure[y]
expenditures_noncardholders = X.expenditure[~y]

print('Fraction of those who did not receive a card and had no expenditures: %.2f' \
      %((expenditures_noncardholders == 0).mean()))
print('Fraction of those who received a card and had no expenditures: %.2f' \
      %(( expenditures_cardholders == 0).mean()))

# Drop leaky predictors from dataset
potential_leaks = ['expenditure', 'share', 'active', 'majorcards']
X2 = X.drop(potential_leaks, axis=1)

# Evaluate the model with leaky predictors removed
my_pipeline = make_pipeline(RandomForestClassifier(n_estimators=100))
cv_scores = cross_val_score(my_pipeline, X2, y, 
                            cv=5,
                            scoring='accuracy')

print("Cross-val accuracy: %f" % cv_scores.mean())

# This accuracy is quite a bit lower, which might be disappointing. 
# However, we can expect it to be right about 80% of the time when used on new applications, 
# whereas the leaky model would likely do much worse than that (in spite of its higher apparent score in cross-validation).
