### Import Dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

### Loading the dataset

In [None]:
# Load the training and test sets for preprocessing.
# The separator is a regular expression (a comma with optional surrounding
# spaces). Regex separators are only handled by pandas' Python parser engine,
# so request it explicitly instead of relying on the silent fallback, whose
# warning is hidden by the global warnings filter.
training_data = pd.read_csv("TRAINING_.csv", index_col=0, header=0,
                            delimiter=" *, *", engine="python")
testing_data = pd.read_csv("TEST_.csv", header=0,
                           delimiter=" *, *", engine="python")
print(training_data.shape)
print(testing_data.shape)

In [None]:
# Preview the first rows of the training set as loaded
training_data.head()

In [None]:
# Preview the first rows of the test set as loaded
testing_data.head()

In [None]:
# Keep only the model features plus the target ('Grade'), in a fixed order.
# Take an explicit copy: later cells assign into columns of this frame, and a
# plain column selection may be treated as a view of the original frame
# (SettingWithCopyWarning / ambiguous write semantics).
training_data = training_data[['Area(total)','Troom','Nbedrooms','Nbwashrooms',
                               'Twashrooms','roof','Roof(Area)','Lawn(Area)',
                               'Nfloors','API','ANB','EXPECTED','Grade']].copy()

In [None]:
# Show the column labels of the training frame
training_data.columns

In [None]:
# Preview the first rows of the training frame
training_data.head()

### Shape of dataset

In [None]:
# Print summary statistics, the (rows, columns) shape and the per-column
# dtypes of the training frame, one after another.
print(
    training_data.describe(),
    training_data.shape,
    training_data.dtypes,
    sep="\n",
)

### finding the missing values

In [None]:
# Per-column count of missing values, training set first, then test set
print(
    training_data.isnull().sum(),
    testing_data.isnull().sum(),
    sep="\n",
)

In [None]:
# For every training column, print a blank line, the column name followed by
# a dashed rule, and the list of distinct values found in that column.
for column_name in training_data.columns:
    rule = "--" * 25
    print()
    print(column_name, rule)
    print(list(training_data[column_name].unique()))

In [None]:
# Normalise the roof flag: rewrite 'no' as 'NO' and 'yes' as 'YES'
training_data['roof'] = (
    training_data['roof']
    .str.replace('no', 'NO')
    .str.replace('yes', 'YES')
)

In [None]:
# Display the most frequent value(s) of the training 'roof' column
training_data['roof'].mode()

In [None]:
### Replacing NaN values with the mean or the mode, depending on the column's datatype

In [None]:
# Fill missing values in these columns with each column's most frequent value.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on a selected column is a chained in-place operation
# that pandas deprecates and that leaves the frame untouched under
# Copy-on-Write (the warning would be hidden by the global warnings filter).
for column in ["Troom","Nbedrooms","Nbwashrooms","Twashrooms","roof"]:
    most_frequent = training_data[column].mode()[0]
    training_data[column] = training_data[column].fillna(most_frequent)

In [None]:
# Fill missing API values with the column mean rounded to a whole number.
# Assign the result back instead of using fillna(inplace=True) on the selected
# column, which does not reliably update the frame (chained in-place call).
api_fill_value = float(round(training_data['API'].mean()))
training_data['API'] = training_data['API'].fillna(api_fill_value)

In [None]:
# float(round(training_data['API'].mean()))

In [None]:
# The mode of the roof column was found to be "NO", so missing Roof(Area)
# values are replaced by zero.
# Assign the result back instead of using fillna(inplace=True) on the selected
# column, which does not reliably update the frame (chained in-place call).
training_data['Roof(Area)'] = training_data['Roof(Area)'].fillna(float(0))

In [None]:
# float(round(training_data['Roof(Area)'].mean()))

In [None]:
# Fill missing Lawn(Area) values with the column mean rounded to a whole
# number. Assign the result back instead of using fillna(inplace=True) on the
# selected column, which does not reliably update the frame.
lawn_fill_value = float(round(training_data['Lawn(Area)'].mean()))
training_data['Lawn(Area)'] = training_data['Lawn(Area)'].fillna(lawn_fill_value)

In [None]:
# Remove trailing '$' characters from the EXPECTED strings
training_data["EXPECTED"]=training_data["EXPECTED"].str.rstrip("$")

In [None]:
# Convert every EXPECTED entry to a Python int
training_data["EXPECTED"] = training_data["EXPECTED"].map(int)

In [None]:
# Re-check the per-column missing-value counts of the training set

print(training_data.isnull().sum())

#### Handling the missing values in the test data as well

In [None]:
# Normalise the roof flag: rewrite 'no' as 'NO' and 'yes' as 'YES'
testing_data['roof'] = (
    testing_data['roof']
    .str.replace('no', 'NO')
    .str.replace('yes', 'YES')
)

In [None]:
# Display the most frequent value(s) of the test 'roof' column
testing_data['roof'].mode()

In [None]:
# Fill missing values in these columns with each column's most frequent value.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on a selected column is a chained in-place operation
# that pandas deprecates and that leaves the frame untouched under
# Copy-on-Write (the warning would be hidden by the global warnings filter).

for column in ["Troom","Nbedrooms","Nbwashrooms","Twashrooms","roof"]:
    most_frequent = testing_data[column].mode()[0]
    testing_data[column] = testing_data[column].fillna(most_frequent)

In [None]:
# Fill missing API values with the test column's mean rounded to a whole
# number. Assign the result back instead of using fillna(inplace=True) on the
# selected column, which does not reliably update the frame.
api_fill_value = float(round(testing_data['API'].mean()))
testing_data['API'] = testing_data['API'].fillna(api_fill_value)

In [None]:
# Display the rounded mean of the test set's API column
float(round(testing_data['API'].mean()))

In [None]:
# The mode of the roof column was found to be "NO", so missing Roof(Area)
# values are replaced by zero.
# Assign the result back instead of using fillna(inplace=True) on the selected
# column, which does not reliably update the frame (chained in-place call).
testing_data['Roof(Area)'] = testing_data['Roof(Area)'].fillna(float(0))

In [None]:
# Fill missing Lawn(Area) values in the test set with a rounded mean.
# Assign the result back instead of using fillna(inplace=True) on the selected
# column, which does not reliably update the frame (chained in-place call).
# NOTE(review): the mean is taken from training_data, whereas the test set's
# API column is filled from testing_data's own mean - confirm which source is
# intended and make the two consistent.
lawn_fill_value = float(round(training_data['Lawn(Area)'].mean()))
testing_data['Lawn(Area)'] = testing_data['Lawn(Area)'].fillna(lawn_fill_value)

In [None]:
# Remove trailing '$' characters from the EXPECTED strings
testing_data["EXPECTED"]=testing_data["EXPECTED"].str.rstrip("$")

In [None]:
# Convert every EXPECTED entry to a Python int
testing_data["EXPECTED"] = testing_data["EXPECTED"].map(int)

In [None]:
# Re-check the per-column missing-value counts of the test set
print(testing_data.isnull().sum())

### Data Visualizations

In [None]:
def create_piechart(data, column):
    """
    objective: Draw a pie chart of how often each value occurs in one
               categorical column of a pandas DataFrame

    parameter:
        data: pandas DataFrame that contains the column
        column: name of the column whose value frequencies are plotted

    return:
        nothing; the chart is displayed with plt.show()
    """
    # One value_counts() pass yields both the wedge labels (the distinct
    # values) and the wedge sizes (their frequencies), in matching order.
    frequencies = data[column].value_counts().to_dict()
    wedge_labels = list(frequencies)
    wedge_sizes = list(frequencies.values())

    plt.pie(
        wedge_sizes,
        labels=wedge_labels,
        autopct='%1.2f%%',
        shadow=False,
        startangle=45,
    )

    # Equal axis scaling keeps the pie circular
    plt.axis('equal')
    plt.title("Piechart - {}".format(column))
    plt.show()

In [None]:
# Plot the distribution of the Grade values in the training set
create_piechart(training_data, "Grade")

### Label Encoding

In [None]:
# Encode the roof flag numerically: "YES" -> 1, "NO" -> 0
training_data["roof"] = training_data["roof"].map(dict(YES=1, NO=0))

In [None]:
# Encode the roof flag numerically: "YES" -> 1, "NO" -> 0
testing_data["roof"] = testing_data["roof"].map(dict(YES=1, NO=0))

In [None]:
# Encode the target letters as integers: A -> 1, B -> 2, C -> 3, D -> 4, E -> 5
training_data["Grade"] = training_data["Grade"].map(
    {letter: code for code, letter in enumerate("ABCDE", start=1)}
)

In [None]:
# Print the first rows of the training frame, then of the test frame
print(training_data.head(), testing_data.head(), sep="\n")

In [None]:
# Print the dtype of every training column
print(training_data.dtypes)

### Splitting the data into X and y

In [None]:
# Feature matrix: every selected column except the target
X_train = training_data.loc[:, [
    'Area(total)', 'Troom', 'Nbedrooms', 'Nbwashrooms', 'Twashrooms', 'roof',
    'Roof(Area)', 'Lawn(Area)', 'Nfloors', 'API', 'ANB', 'EXPECTED',
]]
# Target vector: the Grade column
y_train = training_data.loc[:, 'Grade']

In [None]:
# Test feature matrix: the same columns, in the same order, as the training features
X_test = testing_data.loc[:, [
    'Area(total)', 'Troom', 'Nbedrooms', 'Nbwashrooms', 'Twashrooms', 'roof',
    'Roof(Area)', 'Lawn(Area)', 'Nfloors', 'API', 'ANB', 'EXPECTED',
]]

In [None]:
# Shape of the training feature matrix
X_train.shape

In [None]:
# Shape of the training target vector
y_train.shape

In [None]:
# Shape of the test feature matrix
X_test.shape

In [None]:
# Cast the encoded target to an integer dtype
y_train = y_train.astype(int)

In [None]:
# Display the training target vector
y_train

#### Using Standard Scaler

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Shapes of the scaled training and test feature arrays
print(X_train.shape, X_test.shape, sep="\n")

#### Using GradientBoostingClassifier (gradient-boosted decision trees)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient-boosting classifier (150 estimators, fixed seed) on the
# scaled training data, then predict the encoded grade of every test row.
model_GradientBoosting = GradientBoostingClassifier(
    n_estimators=150,
    random_state=20,
).fit(X_train, y_train)

y_pred = model_GradientBoosting.predict(X_test)

In [None]:
# Reload the test file from disk, replacing the preprocessed test frame.
# NOTE(review): unlike the first load, no custom delimiter is passed here -
# confirm the column names (e.g. 'id') still parse as expected.
testing_data = pd.read_csv("TEST_.csv")

In [None]:
# Attach the predictions as a new 'Grade' column.
# Presumably the reloaded rows are in the same order as the rows that were
# predicted on - verify.
testing_data["Grade"] = y_pred

In [None]:
# Translate the integer predictions back to letters: 1 -> A, 2 -> B, 3 -> C, 4 -> D, 5 -> E
testing_data["Grade"] = testing_data["Grade"].map(
    {code: letter for code, letter in enumerate("ABCDE", start=1)}
)

In [None]:
# Preview the test frame with the predicted Grade column
testing_data.head()

In [None]:
# Keep only the 'id' and 'Grade' columns for the submission
A = testing_data[['id','Grade']]

In [None]:
# Write the submission as CSV text without the index column.
# NOTE(review): the output file name has no ".csv" extension - confirm this is intended.
A.to_csv("Submission_Gradient(6)", index=False)