**Importing the required libraries**

In [None]:
#importing the common libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

**Mounting the Google Drive**

In [None]:
from google.colab import drive
drive.mount('/content/drive/',force_remount = True)

**Reading the data from the drive**

In [None]:
# Folder on the mounted Drive that holds one training CSV per behaviour.
# Kept in a single constant so the location only has to be changed in one place.
DATA_DIR = '/content/drive/MyDrive/Cow_prediction/'

eating = pd.read_csv(DATA_DIR + 'E1_train.csv')
drinking = pd.read_csv(DATA_DIR + 'D2_train.csv')
walking = pd.read_csv(DATA_DIR + 'W3_train.csv')
standing = pd.read_csv(DATA_DIR + 'S4_train.csv')
lying = pd.read_csv(DATA_DIR + 'L5_train.csv')
ruminating_standing = pd.read_csv(DATA_DIR + 'RS6_train.csv')
ruminating_lying = pd.read_csv(DATA_DIR + 'RL7_train.csv')
grooming = pd.read_csv(DATA_DIR + 'G8_train.csv')
idle = pd.read_csv(DATA_DIR + 'I9_train.csv')

**Concatenating all files into a single dataframe**

In [None]:
# Concatenate all the per-behaviour frames into one.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every
# source file's own row labels are kept, so the same label appears up to nine
# times and label-based selection on `df` becomes ambiguous.
df = pd.concat(
    [drinking, eating, walking, grooming, idle, lying,
     ruminating_lying, ruminating_standing, standing],
    ignore_index=True,
)

In [None]:
#viewing the concatenated dataset
df

In [None]:
df.shape

In [None]:
#checking if there are any categorical features
df.info()

### EDA Techniques

In [None]:
#checking for null values
df.isnull().sum()

**pandas-profiling generates profile reports from a pandas DataFrame**

In [None]:
!pip install -U pandas_profiling


**Generating the profile report**

In [None]:
from pandas_profiling import ProfileReport 
ProfileReport(df,minimal= True)

**AutoViz automatically generates visualizations for a dataset**

In [None]:
!pip install AutoViz
!pip install xlrd

In [None]:
#importing Autoviz class
from autoviz.AutoViz_Class import AutoViz_Class

#Instantiate the AutoViz class
AV = AutoViz_Class()

In [None]:
# Dump the frame for AutoViz, which reads from a file path.
# index=False keeps the row index out of the file so it is not read back
# as an extra unnamed column and plotted as if it were a feature.
df.to_csv('file1.csv', index=False)

In [None]:
dftc = AV.AutoViz('file1.csv')

In [None]:
!pip install dataprep

In [None]:
# 10% random subset used for the dataprep reports below.
# A fixed random_state keeps the subset (and therefore the plots) the same on every run.
cow = df.sample(frac=0.1, random_state=42)

In [None]:
from dataprep.eda import plot

plot(cow)

In [None]:
plot(cow, 'label')

In [None]:
from dataprep.eda import plot_correlation

In [None]:
# plots the most correlated columns to column "label"
plot_correlation(cow,'label')

In [None]:
from dataprep.eda import create_report
create_report(cow)

We notice that there are no null values here.

**describe() computes summary statistics for the numeric columns of a DataFrame (it can also summarize object columns)**

In [None]:
df.describe().T

In [None]:
# Pairwise correlation of every column, drawn as an annotated heat map
cor = df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(data=cor, annot=True, fmt='.0%', cmap="coolwarm")
plt.show()

In [None]:
# Understanding the target variable: number of rows per label.
# The column is passed as keyword `x`: seaborn's first positional argument is
# `data`, so a positionally-passed Series is not interpreted as the x variable.
sns.countplot(x=df['label'])

In [None]:
# Outlier overview: one box per column of df, all on a single figure
fig = plt.figure(figsize=(6, 6))

# linewidth sets the thickness of the lines that outline each box
sns.boxplot(linewidth=1, data=df)

# column names are long, so turn the tick labels sideways
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Box plot of the acc_x column alone, to inspect its outliers
fig = plt.figure(figsize=(6, 6))

sns.boxplot(y="acc_x", data=df)
plt.xticks(rotation="vertical")
plt.show()

In [None]:
#names of the sensor columns that are checked for outliers (all columns except time and label)
col_name = ['acc_x', 'acc_y', 'acc_z', 'gyr_x', 'gyr_y', 'gyr_z', 'mag_x','mag_y', 'mag_z']

In [None]:
# Split the frame into the target column and the feature matrix
y = df['label']                # target
x = df.drop(columns='label')   # every remaining column is treated as a feature

In [None]:
#viewing the independent features
x.head(2)

In [None]:
#viewing the dependent features
y.head(2)

## checking for outliers

#a function to plot the outliers for the given column name
def identify_outliers(df, column_name):
    """Draw a box plot of one column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to plot.
    column_name : str
        Name of the column drawn on the y axis.
    """
    plt.figure(figsize=(6,8))
    # plot the frame that was passed in, not the notebook-level `x`
    sns.boxplot(data = df, y = column_name)
    # show inside the function so the figure is rendered when it is called
    plt.show()

*Identifying outliers*

In [None]:
# Using a for loop inside a function to get the box plots(seaborn) of all the columns
def identify_outliers(df, col_name):
    """Draw one box plot per requested column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the columns to plot.
    col_name : iterable of str
        Names of the columns to plot, one figure each.

    Returns
    -------
    None
    """
    for i in col_name:
        plt.figure(figsize=(6,8))
        # plot the frame that was passed in, not the notebook-level `x`
        sns.boxplot(data = df, y = i)
        # show per figure, inside the function, so each plot is rendered when it is drawn
        plt.show()

In [None]:
identify_outliers(x, col_name)

Replacing outlier with median

*Creating a function to replace outlier values with the median value*


In [None]:
def replace_outlier(x, col_name, plot=True):
    """Replace IQR outliers in the given columns with the column median, in place.

    For each column the quartiles are computed and printed, and every value
    outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` is overwritten with the median
    of the column as it was before any replacement.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to clean. It is modified in place.
    col_name : iterable of str
        Names of the columns to process.
    plot : bool, default True
        When True, draw a box plot of each column after the replacement.

    Returns
    -------
    None
    """
    for i in col_name:
        print('col_name : ',i)
        Q1, Q2, Q3 = np.percentile(x[i], [25, 50, 75])
        IQR = Q3 - Q1
        print('Q1 =',Q1,'Q2 = ',Q2,'Q3 = ',Q3)

        upper_val = Q3 + (1.5 * IQR)
        print('upper', upper_val)
        lower_val = Q1 - (1.5 * IQR)
        print('lower', lower_val)

        # Build both masks before writing anything, and fill with Q2 (the
        # pre-replacement median). Recomputing the median after the upper tail
        # has been patched would give the lower tail a different fill value.
        values = x[i]
        outliers = (values > upper_val) | (values < lower_val)
        # positional boolean mask: unaffected by repeated row labels
        x.loc[outliers.to_numpy(), i] = Q2

        if plot:
            fig = plt.figure(figsize = (6,8))
            sns.boxplot(data = x,y = i)
            plt.xticks(rotation = 'horizontal')
            plt.show()

In [None]:
replace_outlier(x, col_name)

looking for outliers after imputing with median

In [None]:
identify_outliers(x, col_name)

**Making sub sets from the main dataframe**

In [None]:
# Sample datasets: one random subset per group of models.
# random_state makes each draw, and every metric computed from it, reproducible.
# NOTE(review): the subsets are drawn from `df`, not from the outlier-treated `x`,
# so the median replacement done earlier never reaches the models -- confirm intent.
df1 = df.sample(500000, random_state=42)
df2 = df.sample(250000, random_state=42)
df3 = df.sample(180000, random_state=42)

In [None]:
# Target and feature matrix for the first sample
y1 = df1['label']
x1 = df1.drop(columns='label')

In [None]:
# Target and feature matrix for the second sample
y2 = df2['label']
x2 = df2.drop(columns='label')

In [None]:
# Target and feature matrix for the third sample
y3 = df3['label']
x3 = df3.drop(columns='label')

In [None]:
df1.shape

**Normalization using minmaxscaler**

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaling = MinMaxScaler()

In [None]:
scaling.fit_transform(x)

In [None]:
scaling.fit_transform(x1)

In [None]:
scaling.fit_transform(x2)

In [None]:
scaling.fit_transform(x3)

In [None]:
# Min-max scale the full feature frame and wrap the scaled values back into a
# DataFrame that carries the original column names.
x_nor= pd.DataFrame(scaling.fit_transform(x),columns = x.columns)

In [None]:
# Min-max scale the first sample; the scaler is refitted on x1 here.
# NOTE(review): the scaler is fitted on all of x1, before the later train/test
# split of x1_nor, so test rows contribute to the scaling -- confirm acceptable.
# NOTE(review): no index is passed, so x1_nor presumably gets a fresh 0..n-1
# index rather than x1's sampled row labels -- confirm.
x1_nor= pd.DataFrame(scaling.fit_transform(x1),columns = x1.columns)

In [None]:
# Min-max scale the second sample; the scaler is refitted on x2 here.
# NOTE(review): fitted on all of x2, before x2_nor is split into train and test -- confirm acceptable.
x2_nor= pd.DataFrame(scaling.fit_transform(x2),columns = x2.columns)

In [None]:
# Min-max scale the third sample; the scaler is refitted on x3 here.
# NOTE(review): fitted on all of x3, before x3_nor is split into train and test -- confirm acceptable.
x3_nor= pd.DataFrame(scaling.fit_transform(x3),columns = x3.columns)

In [None]:
#select k best
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

### Feature selection using KBest

In [None]:
#ranking the features by their chi-squared score against the label
select_k_best_rank_features = SelectKBest(score_func = chi2, k = 5)
k_best_features = select_k_best_rank_features.fit(x1_nor,y1)

df_k_scores = pd.DataFrame(k_best_features.scores_, columns = ['score'])
# Take the names from x1_nor -- the frame the selector was fitted on -- so each
# score is paired with the column it was computed for, even if x_nor and
# x1_nor ever stop having the same columns.
dfcolumns = pd.DataFrame(x1_nor.columns)

k_best_feature_rank = pd.concat([dfcolumns, df_k_scores], axis = 1)

k_best_feature_rank.columns = ['features', 'k_score']
print(k_best_feature_rank.nlargest(7, 'k_score'))

In [None]:
x1.columns

In [None]:
# Drop the gyroscope columns from the first scaled sample.
# errors='ignore' makes the cell safe to re-run: x1_nor is rebound to its own
# result, so a second run would otherwise raise KeyError on the missing columns.
x1_nor = x1_nor.drop(columns=['gyr_x', 'gyr_y', 'gyr_z'], errors='ignore')

In [None]:
# Drop the gyroscope columns from the second scaled sample.
# errors='ignore' makes the cell safe to re-run: x2_nor is rebound to its own
# result, so a second run would otherwise raise KeyError on the missing columns.
x2_nor = x2_nor.drop(columns=['gyr_x', 'gyr_y', 'gyr_z'], errors='ignore')

In [None]:
# Drop the gyroscope columns from the third scaled sample.
# errors='ignore' makes the cell safe to re-run: x3_nor is rebound to its own
# result, so a second run would otherwise raise KeyError on the missing columns.
x3_nor = x3_nor.drop(columns=['gyr_x', 'gyr_y', 'gyr_z'], errors='ignore')

In [None]:
x1_nor.columns

### Splitting the data into train & test

In [None]:
from sklearn.model_selection import train_test_split as tts
x1_train,x1_test,y1_train,y1_test = tts(x1_nor,y1,test_size=0.2,random_state=10,stratify=y1)

**Mutual information between two random variables is a non-negative value, which measures the dependency between the variables**

In [None]:
from sklearn.feature_selection import mutual_info_classif
# Mutual information between each training feature and the label.
# The estimator is randomized, so a fixed random_state is needed for the
# scores (and the ranking built from them) to be the same on every run.
mutual_info = mutual_info_classif(x1_train, y1_train, random_state=10)
mutual_info

In [None]:
# Label every score with its feature name, then list them from highest to lowest
mutual_info = pd.Series(mutual_info, index=x1_train.columns)
mutual_info.sort_values(ascending=False)

**Plotting MI score**

In [None]:
mutual_info.sort_values(ascending=False).plot.bar(figsize=(20, 8))

### Decision Tree algorithm with SMOTE Oversampling & NearMiss Undersampling

In [None]:
# Class distribution of the training split before oversampling.
# Every label is reported, not only 0 and 1: the frame is built from nine
# behaviour files, so the target is presumably multi-class.
print("Before OverSampling, counts per label:\n{} \n".format(y1_train.value_counts().sort_index()))

# import SMOTE module from imblearn library
# pip install imblearn (if you don't have imblearn in your system)
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
# to_numpy() replaces Series.ravel(), which pandas has deprecated
X_train_res, y_train_res = sm.fit_resample(x1_train, y1_train.to_numpy())

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

# Class distribution after oversampling
print("After OverSampling, counts per label:\n{}".format(pd.Series(y_train_res).value_counts().sort_index()))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
# Create Decision Tree classifier object.
# random_state is fixed so the fitted tree, and the report below, are reproducible.
clf = DecisionTreeClassifier(random_state=10)
# Train Decision Tree Classifier on the SMOTE-oversampled training data
clf = clf.fit(X_train_res, y_train_res)
#Predict the response for the untouched test split
y_pred = clf.predict(x1_test)

In [None]:
# print classification report
print(classification_report(y1_test, y_pred))

In [None]:
# Class distribution of the training split before undersampling.
# Every label is reported, not only 0 and 1: the frame is built from nine
# behaviour files, so the target is presumably multi-class.
print("Before Undersampling, counts per label:\n{} \n".format(y1_train.value_counts().sort_index()))

# apply near miss
from imblearn.under_sampling import NearMiss
nr = NearMiss()

# to_numpy() replaces Series.ravel(), which pandas has deprecated
X_train_miss, y_train_miss = nr.fit_resample(x1_train, y1_train.to_numpy())

print('After Undersampling, the shape of train_X: {}'.format(X_train_miss.shape))
print('After Undersampling, the shape of train_y: {} \n'.format(y_train_miss.shape))

# Class distribution after undersampling
print("After Undersampling, counts per label:\n{}".format(pd.Series(y_train_miss).value_counts().sort_index()))

In [None]:
# Create Decision Tree classifier object.
# random_state is fixed so the fitted tree, and the report below, are reproducible.
clf = DecisionTreeClassifier(random_state=10)
# Train Decision Tree Classifier on the NearMiss-undersampled training data
clf = clf.fit(X_train_miss, y_train_miss)
#Predict the response for the untouched test split
y1_pred = clf.predict(x1_test)

In [None]:
print(classification_report(y1_test, y1_pred))

### Random forest Classifier 

In [None]:
# 80/20 split of the second sample.
# stratify=y2 keeps the label proportions equal in train and test, matching
# how the x1 split is made.
x2_train,x2_test,y2_train,y2_test=tts(x2_nor,y2,test_size=0.2,random_state=10,stratify=y2)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Ten entropy-criterion trees; random_state is fixed so the forest (bootstrap
# samples and feature draws) and the resulting report are reproducible.
classifier= RandomForestClassifier(n_estimators= 10, criterion="entropy", random_state=10)
classifier.fit(x2_train, y2_train)

In [None]:
#Predicting the test set result  
y2_pred= classifier.predict(x2_test)  

In [None]:
print(classification_report(y2_test, y2_pred))

### K nearest Neighbours Classifier 

In [None]:
# 80/20 split of the third sample.
# stratify=y3 keeps the label proportions equal in train and test, matching
# how the x1 split is made.
x3_train,x3_test,y3_train,y3_test=tts(x3_nor,y3,test_size=0.2,random_state=10,stratify=y3)

In [None]:
# Import module for KNN
from sklearn.neighbors import KNeighborsClassifier
# Create KNN instance
# n_neighbors -> number of nearest neighbours consulted for each prediction
knn = KNeighborsClassifier(n_neighbors=3)
# Fit (i.e. train) the model on the x3 training split
knn.fit(x3_train, y3_train)

In [None]:
pred = knn.predict(x3_test)
# Review the predictions
pred

In [None]:
print(classification_report(y3_test, pred))

### Support Vector Machine or SVM 

In [None]:
from sklearn.svm import SVC # "Support vector classifier"  
# Linear-kernel SVM trained on the same x2 split as the random forest.
# NOTE(review): this rebinds `classifier`, so the fitted random forest is no longer reachable by that name.
# NOTE(review): x2_train is 80% of a 250000-row sample; kernel SVC training may be very slow at this size -- confirm runtime is acceptable.
classifier = SVC(kernel='linear', random_state=0)  
classifier.fit(x2_train, y2_train)  

In [None]:
#Predicting the test set result  
ypr= classifier.predict(x2_test)  

In [None]:
print(classification_report(y2_test, ypr))

### XGBoost Classifier

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# Tune the learning rate (eta) with gamma and reg_lambda held fixed,
# scored by accuracy over 10-fold stratified CV repeated 3 times.
model=XGBClassifier()
grid = {'gamma':[2],'reg_lambda':[4],'eta':[.01,.03,.04,.05]}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')
# Search on the training split only. Fitting on all of x2_nor / y2 would let
# the rows in x2_test influence the chosen parameters, so the final test
# report would no longer be measured on unseen data.
grid_result = grid_search.fit(x2_train, y2_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))


In [None]:
#  https://xgboost.readthedocs.io/en/stable/parameter.html


In [None]:
# Parameter selection
from xgboost import XGBClassifier
# Final XGBoost model, fitted on the x2 training split with fixed hyper-parameters.
# NOTE(review): eta/gamma/reg_lambda are typed in by hand -- confirm they match the grid search's best_params_.
XGBc = XGBClassifier(eta=0.04,gamma=2,reg_lambda=4)# Parameter selected
XGBc.fit(x2_train,y2_train)
# predictions on the held-out split
y11_pred=XGBc.predict(x2_test)
# predictions on the training split; not used anywhere in the visible cells
y22_pred1=XGBc.predict(x2_train)


In [None]:
print(classification_report(y2_test, y11_pred))