In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the training and test CSV files from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

## Peek at the data

In [None]:
# Display the first few rows of the training frame as a sanity check.
train.head()

## Summary of the data

In [None]:
# Display summary statistics of the training frame.
train.describe()

## Looking at the Data types

In [None]:
# Display the dtype of every column in the training frame.
train.dtypes

## Checking the missing values

In [None]:
# Number of missing values per column.
train.isnull().sum()

## Filling missing values with mean/mode

In [None]:
# Impute missing values: a categorical column with its most frequent value
# (mode) and a numeric column with its mean, rounded up.
# NOTE(review): the original cell indexed train[''] (an empty placeholder
# column name), called a `mode` function that is never imported, and used
# np.math, which modern NumPy no longer provides. Outlet_Size and Item_Weight
# are assumed to be the columns with missing values -- confirm against the
# per-column null counts computed earlier in the notebook.
train['Outlet_Size'] = train['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0])
train['Item_Weight'] = train['Item_Weight'].fillna(np.ceil(train['Item_Weight'].mean()))

## Basic Pivot table similar to Pivot table in Excel

In [None]:
# Mean Item_Outlet_Sales with Outlet_Size on the rows and Item_Fat_Content
# on the columns.
pd.pivot_table(
    train,
    values=["Item_Outlet_Sales"],
    index=["Outlet_Size"],
    columns=["Item_Fat_Content"],
    aggfunc=[np.mean],
)

## Subsetting Data

### Subsetting by Names

In [None]:
# Option 1: keep only the columns of interest.
train1 = train[['Item_Identifier','Item_Fat_Content','Item_Outlet_Sales']]

#(or)

# Option 2: drop the unwanted columns into a NEW frame. The original dropped
# them from `train` in place, which removed Item_Fat_Content and
# Item_Outlet_Sales for every later cell that selects them from `train`.
remove_cols = ['Item_Fat_Content','Item_Outlet_Sales']
train_without_cols = train.drop(columns=remove_cols)

### Subsetting by Indices

In [None]:
# Three-column working frame used by the positional-indexing examples.
subset_cols = ['Item_Identifier', 'Item_Fat_Content', 'Item_Outlet_Sales']
train1 = train[subset_cols]

In [None]:
# Display the first few rows of the working subset.
train1.head()

#### Selecting the required rows

In [None]:
# Rows at positions 0 and 1, all columns.
train2 = train1.iloc[:2]

In [None]:
# Display the selected rows.
train2

#### Select the first n rows

In [None]:
# First n rows (n = 5), all columns.
train2 = train1.head(5)

In [None]:
# Display the first n rows selected above.
train2

#### Remove the first n rows

In [None]:
# Everything except the first n rows (n = 5).
train2 = train1.iloc[5:]

In [None]:
# Display the frame with the first n rows removed.
train2

#### Select the last n rows

In [None]:
# Last n rows (n = 5), all columns.
train2 = train1.tail(5)

In [None]:
# Display the last n rows selected above.
train2

#### Remove the last n rows

In [None]:
# Everything except the last n rows (n = 5); a negative count passed to
# head() means "all rows but the last |n|".
train2 = train1.head(-5)

In [None]:
# Display the frame with the last n rows removed.
train2

#### Selecting the required Cols

In [None]:
# Five-column working frame used by the column-slicing examples below.
wanted_cols = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Outlet_Sales',
    'Item_Visibility',
    'Outlet_Location_Type',
]
train1 = train[wanted_cols]
train1.head()

In [None]:
# Columns at positions 0 and 1, all rows.
train2 = train1.iloc[:, :2]
train2.head()

#### Selecting the first n cols

In [None]:
# First n columns (n = 3), all rows.
train2 = train1.iloc[:, 0:3]
train2.head()

#### Removing the first n cols

In [None]:
# Everything except the first n columns (n = 3), selected by label.
kept_cols = train1.columns[3:]
train2 = train1[kept_cols]
train2.head()

#### Selecting the last n cols

In [None]:
# Last n columns (n = 3), selected by label.
kept_cols = train1.columns[-3:]
train2 = train1[kept_cols]
train2.head()

#### Removing the last n cols

In [None]:
# Everything except the last n columns (n = 3), selected by label.
kept_cols = train1.columns[:-3]
train2 = train1[kept_cols]
train2.head()

## Splitting of Categorical and Continuous variables

In [None]:
# Partition the column names by dtype: 'float' columns are treated as
# continuous, 'object' columns as categorical; any other dtype is left out
# of both lists.
cont_columns = [name for name in train.columns if train[name].dtype == 'float']
cat_columns = [name for name in train.columns if train[name].dtype == 'object']

In [None]:
# Show the categorical list first, then the continuous list, one per line.
print(cat_columns, cont_columns, sep='\n')

## Removing Variables from a Data set

In [None]:
# Keep the identifier column aside before removing it from the frame.
ID = train['Item_Identifier']
# Drop unnecessary columns.
# NOTE(review): the original dropped a column named 'id', which is not
# referenced anywhere else in this notebook; the identifier saved above is
# 'Item_Identifier', so that is the column removed here -- confirm.
train = train.drop(columns='Item_Identifier')
# Keep the categorical column list in sync with the frame.
cat_columns = [name for name in cat_columns if name != 'Item_Identifier']

## Removing variables from a list

In [None]:
# Remove an entry from a list by building a filtered copy.
# The original called cont_columns.pop(14), which raises IndexError unless the
# list has at least 15 entries, and which mutates the list that the pair plot
# and correlation cells below index `train` with (the pair plot needs
# 'Item_Outlet_Sales' to still be present in cont_columns).
# NOTE(review): the target column is assumed to be the entry meant to be
# removed -- confirm.
cont_features = [name for name in cont_columns if name != 'Item_Outlet_Sales']
cont_features

## Log Transformation

In [None]:
# Replace the sales column by log(1 + sales). Re-running this cell applies
# the transform again on top of the previous result.
sales_col = 'Item_Outlet_Sales'
train[sales_col] = np.log1p(train[sales_col])

# Visualizations

## Pair wise Visualizations with target variable

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Scatter matrix of the continuous features with KDE plots on the diagonal.
pair_vars = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Item_Outlet_Sales']
sns.pairplot(
    train[cont_columns],
    vars=pair_vars,
    kind='scatter',
    diag_kind='kde',
)

## Simple Scatter Plot

In [None]:
# Item_MRP (x) against Item_Outlet_Sales (y) as a dark-blue scatter plot.
ax1 = train.plot(kind='scatter', x='Item_MRP', y='Item_Outlet_Sales', c='DarkBlue')

## Histogram of Target variable

In [None]:
target = 'Item_Outlet_Sales'
plt.figure(figsize=(10,10))
# The target column was already replaced by its log1p in the
# "Log Transformation" cell, so it is plotted as-is here; the original applied
# np.log1p a second time and therefore plotted log1p(log1p(sales)).
plt.hist(train[target])
plt.title("Histogram of Target Variable- %s" %target)
plt.xlabel("log1p(%s)" % target)
plt.ylabel("Frequency")
plt.show()

## Correlation Matrix for continuous variables

In [None]:
# Absolute pairwise correlations between the continuous features.
correlationMatrix = train[cont_columns].corr().abs()

fig, ax = plt.subplots(figsize=(13, 9))

# Annotated heatmap of every coefficient.
sns.heatmap(correlationMatrix, annot=True, ax=ax)

# Second pass on the same axes: cells with a coefficient below 1 are masked
# out, and no extra colour bar is drawn.
below_one = correlationMatrix < 1
sns.heatmap(correlationMatrix, mask=below_one, cbar=False, ax=ax)
plt.show()


## Continuous Variables Distribution

In [None]:
# Box plot of Item_Weight on a 13x9 inch figure.
plt.figure(figsize=(13, 9))
sns.boxplot(data=train, x="Item_Weight", orient='v')

## Skewness in Continuous variables

In [None]:
from scipy import stats

# Skewness value marked by the red dashed reference line.
SKEW_THRESHOLD = 0.25

# Sample skewness of every continuous feature. Missing values are dropped
# first because stats.skew propagates NaN by default.
skewness_list = [stats.skew(train[cn].dropna()) for cn in cont_columns]

plt.figure(figsize=(10,7))
plt.plot(skewness_list, 'bo-')
plt.xlabel("continous features")
plt.ylabel("skewness")
plt.title("plotting skewness of the continous features")
# One tick per feature, labelled with the feature name. The original passed
# 15 tick positions with 14 labels (a length mismatch) and assumed exactly
# 14 features.
plt.xticks(range(len(skewness_list)), cont_columns, rotation=45)
# Reference line spanning the whole axis, whatever the number of features.
plt.axhline(SKEW_THRESHOLD, color='r', linestyle='--',
            label='threshold = %s' % SKEW_THRESHOLD)
plt.legend()
plt.show()


## Histogram of Skewed Columns

In [None]:
# Continuous features whose skewness is at least 0.25; skewness_list is
# positionally aligned with cont_columns.
skewed_cont_columns = [
    name
    for position, name in enumerate(cont_columns)
    if skewness_list[position] >= 0.25
]

In [None]:
import matplotlib.gridspec as gridspec

# One distribution plot per skewed feature, laid out on a 6x2 grid.
plt.figure(figsize=(15,25))
gs = gridspec.GridSpec(6, 2)
for position, feature in enumerate(skewed_cont_columns):
    ax = plt.subplot(gs[position])
    sns.distplot(train[feature], bins=50)
    ax.set(xlabel='', title='hist plot of feature: ' + str(feature))
plt.show()

## Frequency Plot of all Categorical Variables

In [None]:
cols = ['Item_Fat_Content','Outlet_Size','Outlet_Location_Type']

# Count plots in an n_rows x n_cols layout: one figure per row, each holding
# n_cols panels that share the y axis.
n_cols = 3
n_rows = 1
for row in range(n_rows):
    fg, ax = plt.subplots(nrows=1, ncols=n_cols, sharey=True, figsize=(12, 8))
    for offset, panel in enumerate(ax):
        sns.countplot(x=cols[row*n_cols + offset], data=train, ax=panel)

## Taking Count, CountUniq, Cumsum, Mean at certain levels

In [None]:
## Count (Eg: Count of Outlet Size)
# Only printed for inspection; this one is not merged back into `train`.
group_cols = ['Outlet_Size']
agg_name = 'Outlet_Size_Count'
gp = train.groupby(group_cols).size().rename(agg_name).to_frame().reset_index()
print(gp)

## Count Unique (Eg: Unique count of Location Type at Outlet Size level)
# Given its own column name: the original reused 'Outlet_Size_Count' here,
# which labelled a unique count as a row count.
group_cols = ['Outlet_Size']
counted = 'Outlet_Location_Type'
agg_name = 'Outlet_Size_nunique_Location_Type'
gp = train[group_cols+[counted]].groupby(group_cols)[counted].nunique().reset_index().rename(columns={counted:agg_name})
train = train.merge(gp, on=group_cols, how='left')

## Cumulative Count (Eg: Cum count of Outlet Size at Item Fat Content level)
# cumcount() returns one value per row, indexed like `train`, so it is
# assigned as a column directly. The original tried to merge it on
# group_cols, which fails because the result is an unnamed Series with no
# key column.
group_cols = ['Item_Fat_Content']
counted = 'Outlet_Size'
agg_name = 'Item_Fat_Content_cumcount_Outlet_Size'
train[agg_name] = train[group_cols+[counted]].groupby(group_cols)[counted].cumcount()

## Mean (Eg: Mean of Sales at Item_Fat_Content Level)
group_cols = ['Item_Fat_Content']
counted = 'Item_Outlet_Sales'
agg_name = 'Item_Fat_Content_mean_Sales'
gp = train[group_cols+[counted]].groupby(group_cols)[counted].mean().reset_index().rename(columns={counted:agg_name})
train = train.merge(gp, on=group_cols, how='left')

## Variance (Eg: Variance of Sales at Item_Fat_Content Level)
group_cols = ['Item_Fat_Content']
counted = 'Item_Outlet_Sales'
agg_name = 'Item_Fat_Content_variance_Sales'
gp = train[group_cols+[counted]].groupby(group_cols)[counted].var().reset_index().rename(columns={counted:agg_name})
train = train.merge(gp, on=group_cols, how='left')

## Taking Lags

In [None]:
# Work on a copy: `train1 = train` only binds a second name to the same
# DataFrame, so the lag columns added below (which start with a missing
# value) would also appear in `train`.
train1 = train.copy()

## Lag of one row over the whole frame
train1['lag1'] = train1['Item_Outlet_Sales'].shift()

## Lag at a certain Level (previous row within the same Item_Fat_Content group)
train1['Lag_Fat_Content_Level'] = train1.groupby('Item_Fat_Content')['Item_Outlet_Sales'].shift(1)

In [None]:
# Display the first few rows of train1, including the lag columns.
train1.head()

## Sorting of Data Frame

In [None]:
# Sort rows by Item_Fat_Content, then Item_Type, both ascending.
# sort_values is the supported API for sorting by column values; the `by`
# argument of sort_index used originally is no longer available in pandas.
train1 = train1.sort_values(by=['Item_Fat_Content', 'Item_Type'], ascending=[True, True])
train1.head()

In [None]:
# Display the first few rows of train for comparison with the sorted train1.
train.head()

## One Hot Encoding of Categorical Features

In [None]:
# One-hot encode the two categorical columns into a NEW frame. The original
# overwrote `train`, which replaced Item_Fat_Content and Outlet_Size with
# dummy columns even though the label-encoding cells that follow still read
# those columns from `train`.
train_onehot = pd.get_dummies(train, columns=['Item_Fat_Content','Outlet_Size'], drop_first=True)
train_onehot.head()

## Label Encoding of Categorical Features

In [None]:
from sklearn.preprocessing import LabelEncoder

# Demonstrate on a copy: `train1 = train` would alias the same DataFrame, so
# the encoded column would overwrite the one in `train` (and stay encoded if
# the cell stopped before the reverse transform).
train1 = train.copy()
le = LabelEncoder().fit(train1['Item_Fat_Content'])
train1['Item_Fat_Content'] = le.transform(train1['Item_Fat_Content'])
print (train1.head())

## Reverse Transform: map the integer codes back to the original labels
train1['Item_Fat_Content'] = le.inverse_transform(train1['Item_Fat_Content'])
print (train1.head())

## Scaling and PCA

In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy is
# made), so in-place edits made through train1 are also visible through train.
train1 = train

In [None]:
from sklearn import preprocessing

# Label-encode every categorical column that is still present in train1.
# - Columns listed in cat_columns but since dropped or re-encoded are skipped
#   instead of raising KeyError.
# - Each column gets its own encoder, kept in label_encoders so the mapping
#   can be inverted later (the original reused one encoder named le_sex and
#   so kept only the last column's mapping).
# - Values are cast to str so that missing values become the label 'nan'
#   rather than mixing floats with strings.
label_encoders = {}
for col in cat_columns:
    if col not in train1.columns:
        continue
    encoder = preprocessing.LabelEncoder()
    train1[col] = encoder.fit_transform(train1[col].astype(str))
    label_encoders[col] = encoder

In [None]:
from sklearn.preprocessing import scale

# Take the frame's underlying array and scale it in a single step.
X = scale(train1.values)

In [None]:
from sklearn.decomposition import PCA

# Keep every component (n_components=None). The original used PCA without
# importing it and hard-coded n_components=131, which scikit-learn rejects
# when X has fewer than 131 columns.
pca = PCA(n_components=None)
pca.fit(X)

In [None]:
# Per-component explained-variance ratio, and its running total in percent
# (each ratio rounded to 4 decimals before summing).
var = pca.explained_variance_ratio_
var1 = (np.round(var, decimals=4) * 100).cumsum()

## Train and Validation Split

In [None]:
# train_test_split lives in sklearn.model_selection; the
# sklearn.cross_validation module imported originally no longer exists.
from sklearn.model_selection import train_test_split

# Features: every column except the target. Target: Item_Outlet_Sales.
X = train.loc[:, train.columns != 'Item_Outlet_Sales']
y = train['Item_Outlet_Sales']

# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

## Modelling

### Linear Regression

In [None]:
from sklearn import linear_model

# Build the linear regression estimator, then fit it on the training split.
regr = linear_model.LinearRegression()
regr.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error on the held-out split. The original referenced X_val and
# Y_val, which are never defined; the split cell produces X_test and y_test.
# The imported metric is now used instead of a hand-rolled mean of abs().
print("Mean Absolute error: %.2f"
      % mean_absolute_error(y_test, regr.predict(X_test)))

In [None]:
# Plot predicted against actual values on the held-out split. The original
# referenced X_val and Y_val, which are never defined; the split cell
# produces X_test and y_test.
plt.scatter(regr.predict(X_test), y_test, color='black')
plt.xlabel("Predicted")
plt.ylabel("Actual")

# Hide the tick marks on both axes, as in the original.
plt.xticks(())
plt.yticks(())

plt.show()

### Logistic Regression

In [None]:
import statsmodels.api as sm

# Logit needs a binary (0/1) response, but y_train is the continuous sales
# target. For this demonstration the response is derived as "above-median
# sales".
# NOTE(review): confirm the intended binary outcome.
y_train_binary = (y_train > y_train.median()).astype(int)

# Build the model. The original bound it to `res`, called summary() on the
# unfitted model, and then fitted an undefined name `logit`.
# NOTE(review): assumes every column of X_train is numeric or boolean.
logit = sm.Logit(y_train_binary, X_train.astype(float))

# fit the model
result = logit.fit()

## Print the Summary
result.summary()

### Random Forest

In [None]:
## For Classification
from sklearn.ensemble import RandomForestClassifier

# A classifier needs discrete labels, but y_train is the continuous sales
# target; the class used here is "above-median sales".
# NOTE(review): confirm the intended class definition.
# The original fitted on Y_train, which is never defined (the split cell
# produces y_train).
y_train_class = (y_train > y_train.median()).astype(int)

# Fixed seed so the fitted forest is reproducible.
model = RandomForestClassifier(n_estimators=500, random_state=42)
model.fit(X_train, y_train_class)

In [None]:
## For Regression
from sklearn import tree

# NOTE(review): this section is titled "Random Forest" but fits a single
# decision tree; switch to sklearn.ensemble.RandomForestRegressor if a forest
# is intended.
model = tree.DecisionTreeRegressor()
# The original fitted on Y_train, which is never defined; the split cell
# produces y_train.
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out split. The original used X_val, which is never
# defined; the split cell produces X_test.
predicted = model.predict(X_test)

### XGBoost

In [None]:
# The original cell used `xgb` without importing it anywhere in the notebook.
import xgboost as xgb

# Wrap the splits in DMatrix containers. The original referenced X_val/y_val
# and then rebuilt dtrain from train_x/train_y; none of those names are
# defined, while the split cell produces X_train/X_test/y_train/y_test.
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)

params = {
    # Squared-error regression; replaces the deprecated name "reg:linear".
    "objective": "reg:squarederror",
    "booster": "gbtree",
    "max_depth": 6,
    "eval_metric": "mae",
    "eta": 0.05,
    # Suppress library logging; replaces the deprecated "silent" flag.
    "verbosity": 0,
    # Fraction of columns sampled per tree; the original key 'colsample' is
    # not a recognised XGBoost parameter.
    "colsample_bytree": 0.9,
    "subsample": 0.9,
}

In [None]:
import xgboost as xgb

# Evaluation sets reported after every boosting round; early stopping
# monitors the last entry.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

# Up to 500 rounds, stopping once the eval metric has not improved for 10
# rounds. The booster gets its own name: the original assigned it to `xgb`,
# which replaced the imported module and broke every later xgb.* call
# (including a re-run of this cell).
xgb_model = xgb.train(params, dtrain, num_boost_round=500, evals=watchlist,
                      early_stopping_rounds=10, verbose_eval=True)