In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport

## I) Data Assessment and Understanding

### Loading Dataset

In [None]:
# Load the training data and preview the first 15 rows.
df = pd.read_csv('train_data.csv')
df.head(15)
# Unused read_csv option kept for reference: parse_dates=['Date_of_Occupancy']

In [None]:
def data_understanding(data):
    """Print a quick structural summary of a dataframe.

    Args:
        data: dataframe to summarise.

    Return:
        None: the shape, the column names and the ``DataFrame.info()`` report
        are printed to the screen.
    """
    separator = '---------------------------------'

    # checking the data shape
    print('Data Shape')
    print(separator)
    print(data.shape)
    print(separator + '\n\n')

    # displaying the columns
    print('Data Columns')
    print(separator)
    print(data.columns.values)
    print(separator + '\n\n')

    # checking data info
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    print('Data Info')
    print(separator)
    data.info()
    print(separator + '\n\n')

In [None]:
data_understanding(df)

In [None]:
ProfileReport(df)

In [None]:
# The raw file encodes a missing NumberOfWindows as the literal string '   .';
# blank those entries out so pandas treats them as missing values.
is_dot_placeholder = df['NumberOfWindows'] == '   .'
df.loc[is_dot_placeholder, 'NumberOfWindows'] = np.nan

In [None]:
def missing_data(dataframe):
    """Summarise missing values per column.

    Args:
        dataframe: loaded dataframe
    Return:
        dataframe: one row per input column, sorted by number of nulls
        (descending), with the null count in 'Total' and the share of rows
        that are null (in percent, rounded to 2 decimals) in 'Percentage'
    """
    # Null count per column, most affected columns first.
    null_counts = dataframe.isnull().sum()
    total = null_counts.sort_values(ascending=False)

    # Express the counts as a percentage of the number of rows.
    n_rows = dataframe.shape[0]
    percentage = round((total / n_rows) * 100, 2)

    summary = pd.concat([total, percentage], axis=1, keys=['Total', 'Percentage'])
    return summary

missing_data(df)

In [None]:
def duplicated_value(data):
    """Report duplicated entries in a dataframe or series.

    Prints whether any entry is a duplicate of an earlier one, then returns
    how many such duplicates there are.

    Args:
        data: loaded dataframe
    Return:
        int: Return number of duplicate
    """
    duplicate_mask = data.duplicated()
    print(duplicate_mask.any())
    return duplicate_mask.sum()
duplicated_value(df)

In [None]:
df.nunique()

In [None]:
duplicated_value(df['Customer Id'])

In [None]:
# Replace NaN values with a default value, such as -1
#df['Date_of_Occupancy'] = df['Date_of_Occupancy'].fillna(-1)
# Convert the 'float_col' column to integer
#df['Date_of_Occupancy'] = df['Date_of_Occupancy'].astype(int)

## II) Exploratory Data Analysis

**Distribution of Insured Period and Building Dimension**

In [None]:
# Histogram of each numeric feature of interest (kept in dataframe column order).
cols = ['Insured_Period', 'Building Dimension']
numerical_cols = df.columns[df.columns.isin(cols)].tolist()
histograms = df[numerical_cols].hist(
    figsize=(8, 5),
    grid=False,
    color='#BBC6C8',
)

**Insured_Period**: From the histogram, the insured period varies between 0 and 1 year and peaks at 1 year, which means that the most common insured period is a full year. The data is left-skewed: few insured buildings have an insured period of less than a year, and many have an insured period of exactly one year.

**Building Dimension**: From the histogram, the building dimension varies between 1 and 20940 m2. Most of the buildings have an average dimension of 1083 m2. The data is right-skewed: most of the buildings have a dimension of less than 5000 m2, and only a few have a dimension of more than 5000 m2.

In [None]:


# Horizontal box plot of the insured period, labelled through the returned axes.
ax = sns.boxplot(x=df['Insured_Period'], color='lightblue')
ax.set_xlabel('Insured_Period')
ax.set_title('Box Plot of Insured Period')
plt.show()

The box plot shows many outliers on the left side of the box, which confirms the skewness of the data.

In [None]:
# Horizontal box plot of the building dimension, labelled through the returned axes.
ax = sns.boxplot(x=df['Building Dimension'], color='lightblue')
ax.set_xlabel('Building Dimension')
ax.set_title('Box Plot of Building Dimension')
plt.show()

The box plot shows that the upper whisker ends at about 5000 m2, the minimum dimension is 1 m2, and the central value (the line inside the box) is around 1080 m2; beyond the whisker there are outliers, some with a dimension above 20000 m2. 25% of the buildings have a dimension lower than 500 m2, 75% of the buildings have a dimension lower than 2289 m2, and the middle 50% of the buildings fall between 500 m2 and 2289 m2.

**Count Plot of Categorical Columns**

In [None]:
# One bar chart of value counts per categorical column, laid out on a 2 x 4 grid.
cols = [
    'YearOfObservation', 'Residential', 'Building_Painted', 'Building_Fenced',
    'Garden', 'Settlement', 'Building_Type', 'NumberOfWindows',
]
cat_cols = [cname for cname in df.columns if cname in cols]
fig, axes = plt.subplots(2, 4, figsize=(16, 12))
axes_it = axes.flat
for c, ax in zip(cols, axes_it):
    counts = df[c].value_counts()
    counts.plot(kind="bar", ax=ax, color="#01A9B4", xlabel=c, ylabel='Count')
fig.tight_layout(pad=1.0)
plt.show()

From the above graphs, the highest number of insured buildings occurred in 2012 (1850), and this number keeps decreasing until 2016 (100). Non-residential buildings are the most insured (close to 5000), and non-painted buildings are the most insured (above 5000). There is not much difference between the number of fenced and non-fenced buildings that are insured (above 3500); the same holds for buildings with and without a garden, and for urban and rural settlements. Type 2 buildings are the most insured, and buildings with 4 windows are the most insured.

**Year of Occupancy Count Plot**

In [None]:
# Horizontal bar chart of the 20 most frequent occupancy years.
top_occupancy_years = df['Date_of_Occupancy'].value_counts().head(20)
ax = top_occupancy_years.plot.barh(color='#C8D2D1', figsize=(10, 5), alpha=0.8)
ax.set_title('Year of Occupancy Count Plot')
ax.set_xlabel('Count', fontdict={'fontweight': 'bold'})
ax.set_ylabel('Year of Occupancy')
plt.show()

The highest number of insured buildings were occupied in the year 1960. 

In [None]:
# Value counts of four building characteristics on a 2 x 2 grid.
cols = ['Building_Painted', 'Residential', 'Building_Type', 'NumberOfWindows']
cat_cols = [cname for cname in df.columns if cname in cols]
fig, axes = plt.subplots(2, 2, figsize=(8, 6))
axes_it = axes.flat
for c, ax in zip(cols, axes_it):
    df[c].value_counts().plot(kind="bar", ax=ax, color="#c37028", xlabel=c, ylabel='Count')

# Use a figure-level title: plt.title() only titles the current (last) subplot.
fig.suptitle('Count of insured Building based on Building Characteristics')
# Reserve the top 5% of the figure so the subplots do not overlap the title.
fig.tight_layout(pad=1.0, rect=(0, 0, 1, 0.95))
plt.show()

### Bivariate Analysis

**Bivariate Analysis between Features**

**Relationship between Date of Occupancy and Building Dimension**

In [None]:
# Mean of every numeric column per occupancy year. numeric_only=True skips the
# text columns, which would otherwise make .mean() raise on recent pandas.
k = df.groupby('Date_of_Occupancy').mean(numeric_only=True)

# One point per building: occupancy year against building dimension.
ax = sns.scatterplot(x=df['Date_of_Occupancy'], y=df['Building Dimension'])
ax.set_title('Building Dimension by Date of Occupancy')
plt.show()

There is a weak positive relationship between building dimension and date of occupancy: building dimension increases slightly as the year increases.

**Association between Settlement and Garden**

In [None]:
# Swap the one-letter category codes for readable labels, column by column.
readable_labels = {
    'Building_Painted': {'N': 'Painted', 'V': 'Not Painted'},
    'Building_Fenced': {'N': 'Fenced', 'V': 'Not Fenced'},
    'Garden': {'V': 'Garden', 'O': 'No Garden'},
    'Settlement': {'U': 'Urban', 'R': 'Rural'},
}
for column_name, code_to_label in readable_labels.items():
    df[column_name] = df[column_name].replace(code_to_label)
df.head(2)

In [None]:
df.to_csv('explore.csv')

In [None]:
# Cross-tabulate Settlement against Garden and draw it as a stacked bar chart.
contingency_table = pd.crosstab(df['Settlement'], df['Garden'])
print(contingency_table)
ax = contingency_table.plot(kind='bar', stacked=True, color=['#b29d94', '#fed395'])

# Write the count inside every non-empty bar segment, centred in the segment.
for rect in ax.patches:
    height = rect.get_height()
    if height > 0:
        ax.text(
            rect.get_x() + rect.get_width() / 2,
            rect.get_y() + height / 2,
            f'{height:.0f}',
            ha='center', va='center', fontsize=10,
        )

ax.set_title('Association between Settlement and Garden')  # fixed typo "betwen"
ax.set_xlabel('Settlement')
ax.legend(title='Garden', loc='upper right')
plt.show()


All the buildings in the urban settlement have gardens; all but one of the buildings in the rural settlement have no garden.

In [None]:
# Hypotheses for the chi-square test of Settlement vs Garden.
# (The original strings said "car garden"; the tested column is Garden.)
null_hypothesis = "There is no association between settlement and garden."
alternative_hypothesis = "There is an association between settlement and garden."
# Significance level used to decide whether to reject the null hypothesis.
alpha = 0.05

In [None]:
from scipy.stats import chi2_contingency

In [None]:
# Chi-square test of independence on the Settlement x Garden counts.
# Unpacks the statistic, p-value, degrees of freedom and expected frequencies.
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"The chi-square statistic is {chi2:.3f}")
# The p-value is shown with 3 decimals, so very small values print as 0.000.
print(f"The p-value is {p:.3f}")

In [None]:
# Compare the p-value with the significance level and report the decision.
reject_null = p < alpha
decision_text, conclusion = (
    ("We reject the null hypothesis and conclude that:", alternative_hypothesis)
    if reject_null
    else ("We fail to reject the null hypothesis and conclude that:", null_hypothesis)
)
print(decision_text, conclusion)

**Association between Settlement and Building Fenced**

In [None]:
# Cross-tabulate Settlement against Building_Fenced and draw it as a stacked bar chart.
contingency_table1 = pd.crosstab(df['Settlement'], df['Building_Fenced'])
print(contingency_table1)
ax = contingency_table1.plot(kind='bar', stacked=True)

# Write the count inside every non-empty bar segment, centred in the segment.
for rect in ax.patches:
    height = rect.get_height()
    if height > 0:
        ax.text(
            rect.get_x() + rect.get_width() / 2,
            rect.get_y() + height / 2,
            f'{height:.0f}',
            ha='center', va='center', fontsize=8,
        )

ax.set_title('Association between Settlement and Fenced')  # fixed typo "betwen"
ax.set_xlabel('Settlement')
ax.legend(title='Fenced Building', loc='upper right')
plt.show()




All the buildings in the urban settlement are not fenced, all but two buildings in the rural settlement are fenced.

In [None]:
# Hypotheses and significance level for the Settlement vs Building_Fenced test.
null_hypothesis = "There is no association between settlement and fenced."
alternative_hypothesis = "There is an association between settlement and fenced."
alpha = 0.05
# Chi-square test of independence on the Settlement x Building_Fenced counts.
chi2, p, dof, expected = chi2_contingency(contingency_table1)
print(f"The chi-square statistic is {chi2:.3f}")
# The p-value is shown with 3 decimals, so very small values print as 0.000.
print(f"The p-value is {p:.3f}")

In [None]:
# Compare the p-value with the significance level and report the decision.
reject_null = p < alpha
decision_text, conclusion = (
    ("We reject the null hypothesis and conclude that:", alternative_hypothesis)
    if reject_null
    else ("We fail to reject the null hypothesis and conclude that:", null_hypothesis)
)
print(decision_text, conclusion)

**Association between Settlement vs Building Painted**

In [None]:
# Cross-tabulate Settlement against Building_Painted and draw it as a stacked bar chart.
contingency_table2 = pd.crosstab(df['Settlement'], df['Building_Painted'])
print(contingency_table2)
ax = contingency_table2.plot(kind='bar', stacked=True)

# Write the count inside every non-empty bar segment, centred in the segment.
for rect in ax.patches:
    height = rect.get_height()
    if height > 0:
        ax.text(
            rect.get_x() + rect.get_width() / 2,
            rect.get_y() + height / 2,
            f'{height:.0f}',
            ha='center', va='center', fontsize=8,
        )

ax.set_title('Association between Settlement and Painted')  # fixed typo "betwen"
ax.set_xlabel('Settlement')
ax.legend(title='Building Painted', loc='upper right')  # fixed typo "Buiding"
plt.show()

99% of the buildings in the rural settlement are not painted, 49% of buildings in the urban settlement are not painted and 51% of buildings in the urban settlement are painted.

In [None]:
# Hypotheses and significance level for the Settlement vs Building_Painted test.
null_hypothesis = "There is no association between settlement and painted."
alternative_hypothesis = "There is an association between settlement and painted."
alpha = 0.05
# Chi-square test of independence on the Settlement x Building_Painted counts.
chi2, p, dof, expected = chi2_contingency(contingency_table2)
print(f"The chi-square statistic is {chi2:.3f}")
# The p-value is shown with 3 decimals, so very small values print as 0.000.
print(f"The p-value is {p:.3f}")

In [None]:
# Compare the p-value with the significance level and report the decision.
reject_null = p < alpha
decision_text, conclusion = (
    ("We reject the null hypothesis and conclude that:", alternative_hypothesis)
    if reject_null
    else ("We fail to reject the null hypothesis and conclude that:", null_hypothesis)
)
print(decision_text, conclusion)

**Association between Settlement and Building Type**

In [None]:
# Cross-tabulate Settlement against Building_Type and draw it as a stacked bar chart.
contingency_table4 = pd.crosstab(df['Settlement'], df['Building_Type'])
print(contingency_table4)
ax = contingency_table4.plot(
    kind='bar', stacked=True,
    color=['#ED413E', '#EFB786', '#A78B71', '#E1DCE0'],
)

# Write the count inside every non-empty bar segment, centred in the segment.
for rect in ax.patches:
    height = rect.get_height()
    if height > 0:
        ax.text(
            rect.get_x() + rect.get_width() / 2,
            rect.get_y() + height / 2,
            f'{height:.0f}',
            ha='center', va='center', fontsize=8,
        )

ax.set_title('Association between Settlement and Building Type')  # fixed typo "betwen"
# The crosstab rows (the bars) are settlements; building types are the stacked colours.
ax.set_xlabel('Settlement')
ax.legend(title='Building_Type', loc='upper right')
plt.show()

From the above graph, the number of building types is almost the same for settlement rural and urban.

In [None]:
# Hypotheses and significance level for the Settlement vs Building_Type test.
null_hypothesis = "There is no association between settlement and building type."
alternative_hypothesis = "There is an association between settlement and building type."
# Set alpha here as the sibling test cells do, so this cell does not rely on
# whatever value an earlier cell left behind.
alpha = 0.05
# Chi-square test of independence on the Settlement x Building_Type counts.
# (The statistic was previously unpacked into a misspelled name "hi2", so the
# print below reported the stale statistic from the previous test.)
chi2, p, dof, expected = chi2_contingency(contingency_table4)
print(f"The chi-square statistic is {chi2:.3f}")
print(f"The p-value is {p:.3f}")

In [None]:
# Compare the p-value with the significance level and report the decision.
reject_null = p < alpha
decision_text, conclusion = (
    ("We reject the null hypothesis and conclude that:", alternative_hypothesis)
    if reject_null
    else ("We fail to reject the null hypothesis and conclude that:", null_hypothesis)
)
print(decision_text, conclusion)

**Association between Building type and Residential**

In [None]:
# Cross-tabulate Building_Type against Residential and draw it as a stacked bar chart.
contingency_table5 = pd.crosstab(df['Building_Type'], df['Residential'])
print(contingency_table5)
ax = contingency_table5.plot(kind='bar', stacked=True)

# Write the count inside every non-empty bar segment, centred in the segment.
for rect in ax.patches:
    height = rect.get_height()
    if height > 0:
        ax.text(
            rect.get_x() + rect.get_width() / 2,
            rect.get_y() + height / 2,
            f'{height:.0f}',
            ha='center', va='center', fontsize=8,
        )

# fixed typo "betwen"
ax.set_title('Association between Building Type and Residential Building')
ax.set_xlabel('Building_Type')
ax.legend(title='Residential', loc='upper right')
plt.show()

In [None]:
from matplotlib.ticker import FuncFormatter

# Share of residential buildings (Residential == 1) within each building type.
contingency_table5['Residential_Percent'] = (
    contingency_table5[1] / (contingency_table5[0] + contingency_table5[1])
)

# Bar chart of that share, with the y-axis running from 0% to 100%.
ax = plt.gca()
ax.bar(contingency_table5.index, contingency_table5['Residential_Percent'])
ax.set_ylim(0, 1)
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))
ax.set_title('Proportion of Residential Building for Each Building Type')
ax.set_xlabel('Building_Type')
ax.set_ylabel('Percentage')
plt.show()

Buildings of types 1 and 2 have fewer residential buildings (less than 30%), unlike buildings of types 3 and 4, which have more residential buildings (more than 50% of type 3 and type 4 buildings are residential).

In [None]:
# Labelled bar chart of the residential share per building type.
# Expects the 'Residential_Percent' column to exist on contingency_table5.
print(contingency_table5)
ax = contingency_table5['Residential_Percent'].plot(kind='bar', stacked=False)

# The values are written on the bars, so the y tick marks are hidden.
ax.set_yticks([])

# Bar heights are proportions in [0, 1]; format them as percentages.
# (Formatting with '.0f' rounded every label to "0" or "1".)
for rect in ax.patches:
    height = rect.get_height()
    if height > 0:
        ax.text(
            rect.get_x() + rect.get_width() / 2,
            rect.get_y() + height / 2,
            f'{height:.0%}',
            ha='center', va='center', fontsize=8,
        )

ax.set_title('Proportion of Residential Building for Each Building Type')
ax.set_xlabel('Building_Type')
ax.legend(loc='upper right')
plt.show()

## Target

In [None]:
# Number of policies per Claim value.
ax = sns.countplot(x=df['Claim'], palette=['#F55050', '#E8D2A6'])
ax.set_title('Count of Claims')
# Claim is on the x-axis and the count on the y-axis (the labels were swapped).
ax.set_xlabel('Claim')
ax.set_ylabel('Count', fontdict={'fontweight': 'bold'})
plt.show()

In [None]:
# Row counts for policies without (0) and with (1) a claim.
is_claim = df['Claim'] == 1
is_non_claim = df['Claim'] == 0
non_claim = df.loc[is_non_claim, 'Claim'].value_counts()
claim = df.loc[is_claim, 'Claim'].value_counts()
claim

In [None]:
# Claims per 100 non-claims, derived from the data instead of hard-coded counts.
# NOTE(review): presumably 1634 and 5526 were the claim / non-claim counts — confirm.
# This is a claim-to-non-claim ratio, not the share of all policies with a claim
# (that would be df['Claim'].mean() * 100).
claim_counts = df['Claim'].value_counts()
(claim_counts[1] / claim_counts[0]) * 100

## Target and Features

**How does number of windows influence claims?** 

In [None]:
# Cross-tabulate NumberOfWindows against Claim and draw it as a stacked bar chart.
contingency_table6 = pd.crosstab(df['NumberOfWindows'], df['Claim'])
print(contingency_table6)
ax = contingency_table6.plot(kind='bar', stacked=True, color=['#E1DCE0', '#CEAD6D'])

# Write the count inside every non-empty bar segment, centred in the segment.
for rect in ax.patches:
    height = rect.get_height()
    if height > 0:
        ax.text(
            rect.get_x() + rect.get_width() / 2,
            rect.get_y() + height / 2,
            f'{height:.0f}',
            ha='center', va='center', fontsize=8,
        )

ax.set_title('Count of Claims based on Number of Windows')
ax.set_xlabel('Number of Windows')
ax.legend(title='Claim', loc='upper right')
plt.show()

In [None]:
# Share of policies with a claim (Claim == 1) for each NumberOfWindows value.
contingency_table6['Percentage'] = (
    contingency_table6[1] / (contingency_table6[0] + contingency_table6[1])
)

# Bar chart of that share, with the y-axis running from 0% to 100%.
ax = plt.gca()
ax.bar(contingency_table6.index, contingency_table6['Percentage'], color='#CEAD6D')
ax.set_ylim(0, 1)
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))
ax.set_title('Proportion of Claims based on Number of Windows')
ax.set_xlabel('Number of Windows')
ax.set_ylabel('Claim Percentage')
plt.show()

There is a high percentage of claims among buildings with a high number of windows, whereas buildings with a low number of windows have a low percentage of claims.

**How does Building Type influence claims?**

In [None]:
# Cross-tabulate Building_Type against Claim and draw it as a stacked bar chart.
contingency_table7 = pd.crosstab(df['Building_Type'], df['Claim'])
print(contingency_table7)
ax = contingency_table7.plot(kind='bar', stacked=True, color=['#E1DCE0', '#CEAD6D'])

# Write the count inside every non-empty bar segment, centred in the segment.
for rect in ax.patches:
    height = rect.get_height()
    if height > 0:
        ax.text(
            rect.get_x() + rect.get_width() / 2,
            rect.get_y() + height / 2,
            f'{height:.0f}',
            ha='center', va='center', fontsize=8,
        )

ax.set_title('Count of Claims based on Building Type')
ax.set_xlabel('Building_Type')
# The stacked colours are the Claim values (the crosstab columns), so the
# legend is titled 'Claim' rather than 'Building_Type'.
ax.legend(title='Claim', loc='upper right')
plt.show()

In [None]:
# Share of policies with a claim (Claim == 1) for each building type.
contingency_table7['Percentage'] = (
    contingency_table7[1] / (contingency_table7[0] + contingency_table7[1])
)

# Bar chart of that share, with the y-axis running from 0% to 100%.
ax = plt.gca()
ax.bar(contingency_table7.index, contingency_table7['Percentage'], color='#CEAD6D')
ax.set_ylim(0, 1)
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))
ax.set_title('Proportion of Claims for each Building Type ')
ax.set_xlabel('Building_Type')
ax.set_ylabel('Claim Percentage')
plt.show()

There is a high percentage of claims among buildings with a high building type number, whereas buildings with a low building type number have a low percentage of claims.

**How does settlement influence claims?**

In [None]:
# Cross-tabulate Settlement against Claim and draw it as a stacked bar chart.
contingency_table8 = pd.crosstab(df['Settlement'], df['Claim'])
print(contingency_table8)
ax = contingency_table8.plot(kind='bar', stacked=True)

# Write the count inside every non-empty bar segment, centred in the segment.
for rect in ax.patches:
    height = rect.get_height()
    if height > 0:
        ax.text(
            rect.get_x() + rect.get_width() / 2,
            rect.get_y() + height / 2,
            f'{height:.0f}',
            ha='center', va='center', fontsize=8,
        )

ax.set_title('Association between Settlement and Claim')
ax.set_xlabel('Settlement')
# The stacked colours are the Claim values (the crosstab columns), so the
# legend is titled 'Claim' rather than 'Settlement'.
ax.legend(title='Claim', loc='upper right')
plt.show()

In [None]:
# Share of policies with a claim (Claim == 1) for each settlement.
contingency_table8['Percentage'] = (
    contingency_table8[1] / (contingency_table8[0] + contingency_table8[1])
)

# Bar chart of that share, with the y-axis running from 0% to 100%.
ax = plt.gca()
ax.bar(contingency_table8.index, contingency_table8['Percentage'])
ax.set_ylim(0, 1)
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))
ax.set_title('Proportion of Claims for each Settlement')
ax.set_xlabel('Settlement')
ax.set_ylabel('Claim Percentage')
plt.show()

There is higher percent of claim in the rural settlement(25%) than urban settlement(20%)

**How does building dimension influence claims?**

In [None]:
# Median of every numeric column for each Claim value. numeric_only=True skips
# the text columns, which would otherwise make .median() raise on recent pandas.
t = df.groupby('Claim').median(numeric_only=True)
t

In [None]:
l = df[df['Building Dimension']<1000]

In [None]:
# Claim counts among the buildings selected into `l` (dimension below 1000).
ax = sns.countplot(x=l['Claim'])
ax.set_title('Total claim for building dimension less than 1000 m2')
plt.show()

In [None]:
# Claim counts among buildings whose dimension is at least 1000.
is_large_building = df['Building Dimension'] >= 1000
h = df[is_large_building]
ax = sns.countplot(x=h['Claim'])
ax.set_title('Total claim for building dimension greater than and equal to 1000 m2')
plt.show()

Buildings with a dimension of less than 1000 m2 have fewer claims than buildings with a dimension of 1000 m2 or more, even though there are more insured buildings with a dimension of less than 1000 m2.

In [None]:
df[['Claim','Building Dimension']].corr()

### Multivariate Analysis

**Relationship between building dimension, settlement and claim**

In [None]:
# Grouped bar chart: one bar per Settlement x Claim combination, with
# "Building Dimension" on the y-axis.
# NOTE(review): with kind="bar" seaborn aggregates y per group (mean by default),
# so bar height is an average building dimension, not a number of claims —
# confirm the interpretation written below the chart.
# ci=None switches the error bars off.
sns.catplot(x="Settlement", y="Building Dimension", hue='Claim', data=df, kind="bar", ci=None)
plt.title('Relationship between building dimension, settlement and claim')
plt.show()

From the above graph, we can see that buildings with a high dimension located in the rural settlement have more claims.

**Relationship between building dimension, number of windows and claim**

In [None]:
# Grouped bar chart: one bar per NumberOfWindows x Claim combination, with
# "Building Dimension" on the y-axis.
# NOTE(review): with kind="bar" seaborn aggregates y per group (mean by default),
# so bar height is an average building dimension, not a number of claims —
# confirm the interpretation written below the chart.
# ci=None switches the error bars off.
sns.catplot(x="NumberOfWindows", y="Building Dimension", hue='Claim', data=df, kind="bar", ci=None, color='#ae7954')
plt.title('Relationship between building dimension, number of windows and claim')
plt.show()

From the above graph, Building with high dimension and high number of windows have more claims.

**Relationship between date of occupancy, building dimension and claim**

In [None]:
# Grouped bar chart of "Building Dimension" per Date_of_Occupancy x Claim,
# restricted (via `order`) to the 10 most frequent occupancy years.
# ci=None switches the error bars off.
sns.catplot(x="Date_of_Occupancy", y="Building Dimension", order=df['Date_of_Occupancy'].value_counts().index[:10],hue='Claim', data=df, kind="bar", ci=None)
plt.title('Relationship between date of occupancy, building dimension and claim')
plt.show()

In [None]:
df['Date_of_Occupancy'].dtype

## Data Imputation

**Garden**

In [None]:
# Fill each missing Garden with the most common Garden value observed in the
# same Settlement.
# The previous `if (...).any(): ... elif (...).any():` tested the whole column
# at once, so every gap got the same value, and the elif repeated the 'Urban'
# condition and could never run.
garden_by_settlement = pd.crosstab(df['Settlement'], df['Garden']).idxmax(axis=1)
df['Garden'] = df['Garden'].fillna(df['Settlement'].map(garden_by_settlement))

# Remaining gaps (rows whose Settlement is itself missing or has no Garden data).
df['Garden'].isna().sum()

**Date of Occupancy**

In [None]:
# Median occupancy year within each settlement, used for imputation below.
urb = df.loc[df['Settlement'] == 'Urban', 'Date_of_Occupancy'].median()
ru = df.loc[df['Settlement'] == 'Rural', 'Date_of_Occupancy'].median()

In [None]:
# Fill each missing Date_of_Occupancy with the median of the row's own settlement.
# The previous `if (...).any(): ... elif ...` tested the whole column at once,
# so every gap (urban or rural) was filled with the urban median.
median_for_row = df['Settlement'].map({'Urban': urb, 'Rural': ru})
df['Date_of_Occupancy'] = df['Date_of_Occupancy'].fillna(median_for_row)

# Remaining gaps (rows whose Settlement is neither 'Urban' nor 'Rural').
df['Date_of_Occupancy'].isna().sum()

**Number of Windows**

In [None]:
# Treat a missing window count as its own category.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: that is chained assignment, which warns on recent pandas
# and leaves df unchanged under copy-on-write.
df['NumberOfWindows'] = df['NumberOfWindows'].fillna('No Information')
df['NumberOfWindows'].isna().sum()

**Building Dimension**

In [None]:
# Mean building dimension for fenced and for unfenced buildings, used for imputation below.
fenc = df.loc[df['Building_Fenced'] == 'Fenced', 'Building Dimension'].mean()
notfenc = df.loc[df['Building_Fenced'] == 'Not Fenced', 'Building Dimension'].mean()

In [None]:
# Fill each missing Building Dimension with the mean of the row's own fence group.
# The previous `if (...).any(): ... elif ...` tested the whole column at once,
# so every gap (fenced or not) was filled with the fenced mean.
mean_for_row = df['Building_Fenced'].map({'Fenced': fenc, 'Not Fenced': notfenc})
df['Building Dimension'] = df['Building Dimension'].fillna(mean_for_row)

# Remaining gaps (rows whose Building_Fenced is neither 'Fenced' nor 'Not Fenced').
df['Building Dimension'].isna().sum()

In [None]:
(df[df['Building_Fenced']=='Not Fenced']['Building Dimension']).isna().sum()

In [None]:
df.head(10)

## Feature Engineering


**Number of Windows**

In [None]:
# Collapse the individual window counts into two bands; any other value
# already in the column is left untouched.
lower_than_5 = ['1', '2', '3', '4']
between_5_10 = ['5', '6', '7', '8', '9']
window_bands = {
    **dict.fromkeys(lower_than_5, '<5'),
    **dict.fromkeys(between_5_10, '>=5 & <10'),
}
# One vectorised replace, assigned back to the column. Calling
# replace(..., inplace=True) on the column selection is chained assignment,
# which warns on recent pandas and leaves df unchanged under copy-on-write.
df['NumberOfWindows'] = df['NumberOfWindows'].replace(window_bands)


**Date of Occupancy**

In [None]:
# Derive the building's age at observation time: observation year minus
# occupancy year.
# NOTE(review): assumes Date_of_Occupancy holds a numeric year — TODO confirm.
#
# An earlier approach (disabled below) binned Date_of_Occupancy into six
# century-wide categories, each excluding its lower edge and including its
# upper edge: (1500, 1600], (1600, 1700], ..., (2000, 2100].

#df['Date_of_Occupancy'] = pd.cut(x = df['Date_of_Occupancy'], bins=[1500,1600,1700,1800,1900,2000,2100], labels=['Between 1500 & 1600', 'Between 1600 & 1700', 'Between 1700 & 1800','Between 1800 & 1900','Between 1900 & 2000','Between 2000 & 2100'])
df['Building age'] = df['YearOfObservation']-df['Date_of_Occupancy']
df

**Insured Period**

In [None]:
# Turn Insured_Period into two categories: below the threshold vs everything else.
threshold = 1
is_below_threshold = df['Insured_Period'] < threshold
df['Insured_Period'] = np.where(is_below_threshold, 'Less than 1', 'Equal to 1')
df

In [None]:
# One-hot encode the categorical columns.
categorical_columns = [
    'Insured_Period', 'Building_Painted', 'Building_Fenced',
    'Garden', 'Settlement', 'NumberOfWindows',
]
df_encoded = pd.get_dummies(df, columns=categorical_columns)

# Keep only the modelling columns, in a fixed order. One dummy per category is
# left out as the reference level, along with identifiers and raw date columns.
# (The explicit drop() that used to precede this selection was redundant, listed
# a label twice, and raised KeyError whenever one of the dummies was absent.)
model_columns = [
    'Building Dimension', 'Insured_Period_Equal to 1', 'Residential',
    'Building_Painted_Painted', 'Building_Fenced_Fenced', 'Garden_Garden',
    'Settlement_Rural', 'Building_Type', 'Building age',
    'NumberOfWindows_<5', 'NumberOfWindows_>=5 & <10', 'NumberOfWindows_>=10',
    'Claim',
]
df_encoded = df_encoded[model_columns]

# Shorter names for the binary indicators.
df_encoded = df_encoded.rename(columns={
    'Building_Painted_Painted': 'Painted',
    'Building_Fenced_Fenced': 'Fenced',
    'Garden_Garden': 'Garden',
    'Settlement_Rural': 'Rural',
})
df_encoded

In [None]:
df_encoded.to_csv('df_encoded_2.csv')

**Dividing the dataset into Features and Target**

In [None]:
# Separate the target column from the feature matrix.
target = 'Claim'
# 'YearOfObservation' is not among the encoded columns any more, so requiring
# it here raised KeyError; drop it only if it happens to be present.
X = (
    df_encoded
    .drop(columns=[target])
    .drop(columns=['YearOfObservation'], errors='ignore')
)
y = df_encoded[target]
X

**Separating dataset into train and test**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of each part.
for part_name, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{part_name} shape:", part.shape)

**Feature Selection**

1) Filter Method: Mutual Information is used to measure the amount of dependence between a feature and the target variable. It quantifies the reduction in uncertainty about one variable given knowledge of the other variable.

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Mark every feature except 'Building Dimension' as discrete. This keeps the
# intent of the old index list (all columns but the first) without hard-coding
# positions: indices 1..15 ran past the last column of X and raised IndexError.
discrete_mask = X.columns != 'Building Dimension'

# The estimator adds small random noise to continuous features; fix the seed
# so the scores are repeatable between runs.
mi_scores = mutual_info_classif(X, y, discrete_features=discrete_mask, random_state=42)
for col, score in zip(X.columns, mi_scores):
    print(col, (score * 100))

2) Embedded Method: Lasso (L1) regularization is useful when you have a large number of predictors, some of which may be irrelevant or redundant. It performs both variable selection and regularization by setting some of the coefficients to exactly zero.

In [None]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize the features
# Create a StandardScaler object and fit it to the training data
scaler = StandardScaler()
scaler.fit(X_train['Building Dimension'].values.reshape(-1, 1)) # Use only the column you want to standardize for fitting

# Standardize the column on both the training and testing data
X_train['Building Dimension'] = scaler.transform(X_train['Building Dimension'].values.reshape(-1, 1)).flatten()
X_test['Building Dimension'] = scaler.transform(X_test['Building Dimension'].values.reshape(-1, 1)).flatten()

#alpha=0.1 # alpha is the regularization parameter
lasso = LogisticRegression(penalty='l1', C=0.1, solver='liblinear') 
lasso.fit(X_train, y_train)

# print the selected features and their coefficients
print("Selected Features:", X_train.iloc[:, lasso.coef_[0]!=0].shape[1])
print("Coefficients:", lasso.coef_[0][lasso.coef_[0]!=0])
imp_feats = X_train.columns[(lasso.coef_ != 0).ravel().tolist()]
print(imp_feats)
y_pred = lasso.predict(X_test)
# Evaluate the model on the test set
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

In [None]:
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Regularization strength. Kept under its own name: the notebook already uses
# `alpha` for the 0.05 significance level of the chi-square tests, and
# overwriting it with 1.0 would change their outcome if those cells are re-run.
ridge_alpha = 1.0
ridge = RidgeClassifier(alpha=ridge_alpha)

# Fit the model to the training data
ridge.fit(X_train, y_train)
coef = ridge.coef_
print(coef)

# Make predictions on the testing data
y_pred = ridge.predict(X_test)

# Accuracy is the metric that matches a classifier's label predictions.
ridge_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {ridge_accuracy:.2f}")

# Regression-style scores kept from the original cell. They are computed on
# the predicted labels, so read them with care.
r2 = r2_score(y_test, y_pred)
print(r2)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean squared error: {mse:.2f}")

## Test Dataset

In [None]:
test_data = pd.read_csv('test_data.csv')
test_data.head()

In [None]:
missing_data(test_data)

In [None]:
# Swap the one-letter category codes for readable labels, column by column.
readable_labels = {
    'Building_Painted': {'N': 'Painted', 'V': 'Not Painted'},
    'Building_Fenced': {'N': 'Fenced', 'V': 'Not Fenced'},
    'Garden': {'V': 'Garden', 'O': 'No Garden'},
    'Settlement': {'U': 'Urban', 'R': 'Rural'},
}
for column_name, code_to_label in readable_labels.items():
    test_data[column_name] = test_data[column_name].replace(code_to_label)

# The raw file encodes a missing NumberOfWindows as the literal string '   .'.
is_dot_placeholder = test_data['NumberOfWindows'] == '   .'
test_data.loc[is_dot_placeholder, 'NumberOfWindows'] = np.nan
test_data['NumberOfWindows']

**test data imputation**

In [None]:
# Every `if (...).any(): ... elif ...` in the original tested a whole column
# at once, so each feature was filled with a single value regardless of the
# row's own group. The fills below are chosen row by row instead.
# NOTE(review): the fill values are computed from the test set itself, as in
# the original; consider reusing the training-set statistics — confirm.

# Garden: most common Garden value within the row's Settlement.
garden_by_settlement = pd.crosstab(test_data['Settlement'], test_data['Garden']).idxmax(axis=1)
test_data['Garden'] = test_data['Garden'].fillna(
    test_data['Settlement'].map(garden_by_settlement)
)

# Building Dimension: mean of the row's fence group.
# (`notfenc` used to be assigned the bound method `.mean` without calling it.)
fenc = test_data.loc[test_data['Building_Fenced'] == 'Fenced', 'Building Dimension'].mean()
notfenc = test_data.loc[test_data['Building_Fenced'] == 'Not Fenced', 'Building Dimension'].mean()
test_data['Building Dimension'] = test_data['Building Dimension'].fillna(
    test_data['Building_Fenced'].map({'Fenced': fenc, 'Not Fenced': notfenc})
)

# Date of Occupancy: median of the row's settlement.
urb = test_data.loc[test_data['Settlement'] == 'Urban', 'Date_of_Occupancy'].median()
ru = test_data.loc[test_data['Settlement'] == 'Rural', 'Date_of_Occupancy'].median()
test_data['Date_of_Occupancy'] = test_data['Date_of_Occupancy'].fillna(
    test_data['Settlement'].map({'Urban': urb, 'Rural': ru})
)

# Number of Windows: a missing count becomes its own category. Assigned back
# rather than filled in place on the column selection (chained assignment).
test_data['NumberOfWindows'] = test_data['NumberOfWindows'].fillna('No Information')

# Remaining missing values in the imputed columns.
test_data[['Garden', 'Building Dimension', 'Date_of_Occupancy', 'NumberOfWindows']].isna().sum()

**Feature Engineering**

In [None]:
# Number of windows: collapse the individual counts into coarse buckets.
# Values not listed here are left untouched. The mapping is applied in one
# pass and assigned back, rather than calling `replace(..., inplace=True)`
# on a column selection (chained assignment, which may not update `test_data`).
lower_than_5 = ['1','2','3','4']
between_5_10 = ['5','6','7','8','9']
window_buckets = {
    **dict.fromkeys(lower_than_5, '<5'),
    **dict.fromkeys(between_5_10, '>=5 & <10'),
}
test_data['NumberOfWindows'] = test_data['NumberOfWindows'].replace(window_buckets)

# Building age: difference between the observation year and the occupancy date column.
test_data['Building age'] = test_data['YearOfObservation']-test_data['Date_of_Occupancy']

# Insured period: two categories split at `threshold`.
# NOTE(review): `threshold` is not defined in this cell; it must come from an
# earlier cell — confirm it is set before this runs.
# The dtype guard makes the cell safe to re-run: once the column holds the
# string labels, comparing it with `threshold` again would raise.
if pd.api.types.is_numeric_dtype(test_data['Insured_Period']):
    test_data['Insured_Period'] = np.where(test_data['Insured_Period'] < threshold,'Less than 1', 'Equal to 1')

# Preview only the first rows instead of rendering the whole frame.
test_data.head()

In [None]:
# One-hot encode the categorical columns of the test set. This assignment
# must be live: it is the only definition of `test_data_encoded`, and with it
# commented out the cell fails on a fresh kernel.
test_data_encoded = pd.get_dummies(test_data, columns=['Insured_Period','Building_Painted','Building_Fenced','Garden','Settlement','NumberOfWindows'])

# Keep an explicit list of feature columns (this replaces dropping the unwanted
# ones). Indexing with the list raises a KeyError if an expected dummy column
# is absent from the test data.
model_columns = ['Building Dimension','Insured_Period_Equal to 1','Residential','Building_Painted_Painted','Building_Fenced_Fenced','Garden_Garden','Settlement_Rural','Building_Type','Building age','NumberOfWindows_<5','NumberOfWindows_>=5 & <10','NumberOfWindows_>=10']
test_data_encoded = test_data_encoded[model_columns]

# Shorten the dummy-column names.
test_data_encoded = test_data_encoded.rename(columns={'Building_Painted_Painted':'Painted','Building_Fenced_Fenced':'Fenced','Garden_Garden':'Garden','Settlement_Rural':'Rural'})
test_data_encoded.head(10)

In [None]:
# Write the encoded test features to 'test_data_encoded_2.csv' in the working directory.
# No `index` argument is passed, so pandas' default applies and the row index
# is written as an extra first column — pass index=False if that is unwanted.
test_data_encoded.to_csv('test_data_encoded_2.csv')

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.cluster import KMeans

In [None]:
# Cluster the rows of `data` on its 2nd and 3rd columns and plot the result.
# NOTE(review): `data` is not defined in any cell visible here, and the plot
# assumes it has 'Longitude' and 'Latitude' columns (presumably the two
# columns selected below) — confirm before running.
x = data.iloc[:,1:3]

# Fixed random_state so the cluster labels are reproducible across runs.
kmeans = KMeans(n_clusters=3, random_state=42)

# fit_predict fits the model and returns each row's cluster label in one
# step; a separate fit() beforehand would only train the model twice.
identified_clusters = kmeans.fit_predict(x)

data_with_clusters = data.copy()
data_with_clusters['Clusters'] = identified_clusters
plt.scatter(data_with_clusters['Longitude'],data_with_clusters['Latitude'],c=data_with_clusters['Clusters'],cmap='rainbow')


In [None]:
import numpy as np
from sklearn.cluster import KMeans


# Create a KMeans object with 3 clusters; the fixed random_state makes the
# cluster labels reproducible across runs.
kmeans = KMeans(n_clusters=3, random_state=42)

# fit_predict fits the model to the data and returns each row's cluster label
# in one step, so no separate fit() call is needed.
identified_clusters = kmeans.fit_predict(df_encoded)
identified_clusters

In [None]:
# Copy of the encoded frame with each row's cluster label attached.
data_with_clusters = df_encoded.assign(Clusters=identified_clusters)

# Scatter of the rows coloured by cluster.
# NOTE(review): requires 'Longitude' and 'Latitude' columns in df_encoded —
# not confirmed by the cells visible here.
plt.scatter(
    x=data_with_clusters['Longitude'],
    y=data_with_clusters['Latitude'],
    c=data_with_clusters['Clusters'],
    cmap='rainbow',
)


In [None]:
def checking_outliers(dataframe):
    """
    Function that get outliers from the dataframe

    Args:
        dataframe : pandas dataframe
            Contains the data where the outliers are to be found
    Returns:
        None: prints number of outliers
    """
    outliers = []
    for c in dataframe.columns:
        q25, q75 = np.percentile(dataframe[c], 25), np.percentile(dataframe[c], 75)
        iqr = q75 - q25
        print('Percentiles: 25th = %.3f, 75th = %.3f, IQR = %.3f' % (q25, q75, iqr))

        # calculate the outlier cutoff
        cut_off = iqr * 1.5
        lower, upper = q25 - cut_off, q75 + cut_off

        # identify outliers
        indx = np.where((dataframe[c] < lower) | (dataframe[c] > upper))
        print(c,'----------> Identified outliers: %d' % len(indx[0]))
        