# About Dataset :-

### Let’s consider a Company dataset with around 10 variables and 400 records. 
### The attributes are as follows: 
###  Sales -- Unit sales (in thousands) at each location
###  Competitor Price -- Price charged by competitor at each location
###  Income -- Community income level (in thousands of dollars)
###  Advertising -- Local advertising budget for company at each location (in thousands of dollars)
###  Population -- Population size in region (in thousands)
###  Price -- Price company charges for car seats at each site
###  Shelf Location at stores -- A factor with levels Bad, Good and Medium indicating the quality of the shelving location for the car seats at each site
###  Age -- Average age of the local population
###  Education -- Education level at each location
###  Urban -- A factor with levels No and Yes to indicate whether the store is in an urban or rural location
###  US -- A factor with levels No and Yes to indicate whether the store is in the US or not
### The company dataset looks like this: 

# Problem Statement:
### A cloth manufacturing company wants to know which segments or attributes lead to high sales.
### Approach - A Random Forest can be built with Sales as the target variable (we will first convert it into a categorical variable); all other
### variables will be treated as independent variables in the analysis.

In [3]:
#### Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report

#### Getting the data
# Default location of the company dataset; used when read_data() is called
# without an explicit path.
DEFAULT_DATA_PATH = "C:/Data Science/Nov 2023/19 Nov 2023/Random Forest Project 9 & 10/10. Random Forests/Company_Data.csv"


def read_data(path=DEFAULT_DATA_PATH):
    """Load the company dataset from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to ``DEFAULT_DATA_PATH``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed data, or ``None`` when the file could not be read
        (a message describing the failure is printed in that case).
    """
    try:
        return pd.read_csv(path)
    except Exception as e:
        # Best-effort loader: keep the original message, but also report the
        # real cause, since not every failure is a missing path.
        print("Data Path Not Found and Error is in the Read Data Function")
        print(f"read_data failed for {path!r}: {type(e).__name__}: {e}")
        return None

#### Calling the read data function
df = read_data()
if df is None:
    # read_data() prints a message and returns None when loading fails; stop
    # here with a clear error instead of an AttributeError on df.copy().
    raise RuntimeError("Company data could not be loaded; check the path used by read_data()")
# print(df.head())

#### Preprocessing and Feature Engineering
# Work on a copy so the raw frame `df` stays untouched (it is read again
# further down when Sales is binned).
df1 = df.copy()
# print(df1.head())
# print(df1.info())
# print(df1.shape)

# df1.isnull().sum()

# df1.select_dtypes(include='number').columns

# df1.select_dtypes(include='object').columns

# for i in ['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',
#        'Age', 'Education']:
#     sns.boxplot(data = df1,x = i)
#     plt.show()

# for i in [ 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',
#        'Age', 'Education']:
#     sns.regplot(data = df1,x = i,y = 'Sales' )
#     plt.show()

# Numeric columns that get their outliers capped at the IQR whiskers.
col = [
    'Sales',
    'CompPrice',
    'Income',
    'Advertising',
    'Population',
    'Price',
    'Age',
    'Education',
]

def wisker(col, factor=1.5):
    """Return the lower and upper box-plot whisker bounds of *col*.

    Parameters
    ----------
    col : array-like
        Numeric values. NaN entries are ignored when the quartiles are
        computed, so a few missing values no longer turn both bounds into NaN.
    factor : float, optional
        Multiple of the inter-quartile range added beyond the quartiles.
        Defaults to 1.5.

    Returns
    -------
    tuple
        ``(lower, upper)`` where ``lower = Q1 - factor * IQR`` and
        ``upper = Q3 + factor * IQR``.
    """
    Q1, Q3 = np.nanpercentile(col, [25, 75])
    IQR = Q3 - Q1
    lower = Q1 - factor * IQR
    upper = Q3 + factor * IQR
    return lower, upper

# Cap every listed column at its whisker bounds: values below the lower
# bound are raised to it first, then values above the upper bound are
# lowered to it.
for i in col:
    lower, upper = wisker(df1[i])
    current = df1[i]
    floored = np.where(current < lower, lower, current)
    df1[i] = np.where(floored > upper, upper, floored)

# for i in ['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',
#        'Age', 'Education']:
#     sns.boxplot(data = df1,x = i)
#     plt.show()

# for i in df1.select_dtypes("number"):
#     sns.histplot(data = df1,x = i,kde = True)
#     plt.show()

# for i in df1.select_dtypes("object"):
#     sns.histplot(data = df1,x = i,kde = True)
#     plt.show()

# for i in [ 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',
#        'Age', 'Education']:
#     sns.scatterplot(data = df1,x = i,y = 'Sales',hue = 'ShelveLoc' )
#     plt.show()

# for i in [ 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',
#        'Age', 'Education']:
#     sns.scatterplot(data = df1,x = i,y = 'Sales',hue = 'Urban' )
#     plt.show()

# for i in [ 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',
#        'Age', 'Education']:
#     sns.scatterplot(data = df1,x = i,y = 'Sales',hue = 'US' )
#     plt.show()

# for i in [ 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',
#        'Age', 'Education']:
#     sns.regplot(data = df1,x = i,y = 'Sales' )
#     plt.show()

# df1.columns

# Integer-encode the categorical predictors. Series.map with an explicit dict
# is used instead of Series.replace, whose implicit object-to-int downcast is
# deprecated since pandas 2.2. Any value missing from a mapping becomes NaN.
shelve_codes = {'Bad': 0, 'Medium': 1, 'Good': 2}
yes_no_codes = {'Yes': 0, 'No': 1}

df1['ShelveLoc'] = df1['ShelveLoc'].map(shelve_codes)

df1['Urban'] = df1['Urban'].map(yes_no_codes)

df1['US'] = df1['US'].map(yes_no_codes)

# df1 =pd.get_dummies(df1[['CompPrice', 'Income', 'Advertising', 'Population', 'Price',
#        'ShelveLoc', 'Age', 'Education', 'Urban', 'US']],drop_first=True)

# Sales classes (left-closed intervals): Bad < 5, 5 <= Medium < 10, Good >= 10.
bins = [-float('inf'),5,10,float('inf')]
labels = ['Bad','Medium','Good']

# NOTE(review): the bins are computed from the raw df['Sales'], not from the
# outlier-capped df1['Sales'] -- confirm this is intended.
df1['Sales'] = pd.cut(df['Sales'],bins = bins,labels = labels,right = False)

# Rename the categories to integer codes. Series.replace on a categorical
# column is deprecated since pandas 2.2; rename_categories is the supported way.
df1['Sales'] = df1['Sales'].cat.rename_categories({'Bad': 0, 'Medium': 1, 'Good': 2})

# independent

# final_data = pd.concat([df1,independent,target],axis = 1)

# final_data

# final_data.columns

# final_data.drop(["ShelveLoc","Urban","US"],axis = 1,inplace=True)

def data_prep(df1,t):
    """Split the prepared frame into train/test features and target.

    The ten predictor columns are taken as features and ``Sales`` as the
    target; ``t`` is forwarded to ``train_test_split`` as ``test_size`` and
    the split uses a fixed ``random_state`` of 365.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    feature_names = [
        'CompPrice', 'Income', 'Advertising', 'Population', 'Price',
        'ShelveLoc', 'Age', 'Education', 'Urban', 'US',
    ]
    features = df1[feature_names]
    target = df1['Sales']
    parts = train_test_split(features, target, test_size=t, random_state=365)
    return tuple(parts)

def rndf_tree(X_train,X_test,y_train,random_state=365):
    """Fit a random forest on the training data and predict the test set.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test
        Features passed to ``predict``.
    random_state : int or None, optional
        Seed handed to ``RandomForestClassifier``. Defaults to 365, the seed
        already used for the train/test split, so repeated runs give the same
        predictions. Pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    The predictions returned by ``predict`` for ``X_test``.
    """
    model = RandomForestClassifier(
        criterion="entropy",
        max_depth=5,
        random_state=random_state,
    )
    model.fit(X_train, y_train)
    return model.predict(X_test)

def rndf_tree_diag(y_test,pred):
    """Score predictions against the true labels.

    Returns a 2-tuple: the ``accuracy_score`` result first, then the
    ``classification_report`` result for the same inputs.
    """
    return accuracy_score(y_test, pred), classification_report(y_test, pred)

def main_function():
    """Run split, training and evaluation on the module-level ``df1``.

    Holds out 10% of the rows as the test set (``data_prep(df1, 0.1)``),
    predicts them with ``rndf_tree`` and scores the predictions with
    ``rndf_tree_diag``.

    Returns
    -------
    The classification report produced by ``rndf_tree_diag``; the accuracy
    it also returns is not used here.
    """
    X_train, X_test, y_train, y_test = data_prep(df1, 0.1)
    pred = rndf_tree(X_train, X_test, y_train)
    _, clf = rndf_tree_diag(y_test, pred)
    return clf

# Run the whole pipeline on the module-level df1 and print the classification
# report that main_function() returns.
results = main_function()
print(results)

              precision    recall  f1-score   support

           0       1.00      0.25      0.40         4
           1       0.82      0.93      0.87        29
           2       0.67      0.57      0.62         7

    accuracy                           0.80        40
   macro avg       0.83      0.58      0.63        40
weighted avg       0.81      0.80      0.78        40

