In [217]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

In [218]:
# Load the raw training set; the path is relative to the notebook's working directory.
train_data = pd.read_csv("train.csv")

In [219]:
train_data.head()

Unnamed: 0,Property_ID,Property_Type,Property_Area,Number_of_Windows,Number_of_Doors,Furnishing,Frequency_of_Powercuts,Power_Backup,Water_Supply,Traffic_Density_Score,Crime_Rate,Dust_and_Noise,Air_Quality_Index,Neighborhood_Review,Habitability_score
0,0x21e3,Apartment,106,,1,Semi_Furnished,0.0,No,Once in a day - Morning,5.89,Slightly below average,Medium,90.0,3.86,71.98
1,0x68d4,Apartment,733,2.0,2,Unfurnished,1.0,No,Once in a day - Evening,4.37,Well below average,Medium,96.0,3.55,71.2
2,0x7d81,Apartment,737,4.0,2,Fully Furnished,0.0,No,Once in a day - Morning,7.45,Slightly below average,Medium,121.0,3.81,71.39
3,0x7a57,Apartment,900,3.0,2,Unfurnished,2.0,Yes,Once in a day - Morning,6.16,Well above average,Medium,100.0,1.34,31.46
4,0x9409,Bungalow,2238,14.0,6,Fully Furnished,0.0,No,All time,5.46,Well below average,Medium,116.0,4.77,93.7


In [220]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39499 entries, 0 to 39498
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Property_ID             39499 non-null  object 
 1   Property_Type           39499 non-null  object 
 2   Property_Area           39499 non-null  int64  
 3   Number_of_Windows       37845 non-null  float64
 4   Number_of_Doors         39499 non-null  int64  
 5   Furnishing              38457 non-null  object 
 6   Frequency_of_Powercuts  38116 non-null  float64
 7   Power_Backup            39499 non-null  object 
 8   Water_Supply            39499 non-null  object 
 9   Traffic_Density_Score   39499 non-null  float64
 10  Crime_Rate              38712 non-null  object 
 11  Dust_and_Noise          38280 non-null  object 
 12  Air_Quality_Index       39499 non-null  float64
 13  Neighborhood_Review     39499 non-null  float64
 14  Habitability_score      39499 non-null

In [221]:
# Remove the Property_ID column before modelling.
# Rebinding the name (rather than inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the column is already gone.
train_data = train_data.drop(columns="Property_ID", errors="ignore")

In [222]:
train_data.shape

(39499, 14)

In [223]:
# Working subset for imputing Number_of_Windows: three predictor columns
# plus the column that contains the missing values.
data_for_numofwindows = train_data[
    ['Property_Type', 'Property_Area', 'Number_of_Doors', 'Number_of_Windows']
]

In [224]:
data_for_numofwindows.head()

Unnamed: 0,Property_Type,Property_Area,Number_of_Doors,Number_of_Windows
0,Apartment,106,1,
1,Apartment,733,2,2.0
2,Apartment,737,2,4.0
3,Apartment,900,2,3.0
4,Bungalow,2238,6,14.0


In [225]:
# Rows with no missing value in any of the selected columns; these rows are
# later used to fit the imputation model.
non_nan_data_for_numofwindows = data_for_numofwindows.dropna(axis=0, how='any')
non_nan_data_for_numofwindows.shape

(37845, 4)

In [226]:
# NOTE(review): this displays the entire frame (pandas truncates the rendering);
# a .head() preview would convey the same information with less output.
non_nan_data_for_numofwindows

Unnamed: 0,Property_Type,Property_Area,Number_of_Doors,Number_of_Windows
1,Apartment,733,2,2.0
2,Apartment,737,2,4.0
3,Apartment,900,2,3.0
4,Bungalow,2238,6,14.0
5,Single-family home,1185,3,3.0
...,...,...,...,...
39494,Single-family home,1120,2,3.0
39495,Apartment,445,3,1.0
39496,Bungalow,3780,6,6.0
39497,Single-family home,1266,1,3.0


In [227]:
# Rows whose Number_of_Windows is missing: the rows to be imputed.
# The original row labels of train_data are preserved on this frame.
nan_data_for_numofwindows = data_for_numofwindows.loc[
    data_for_numofwindows['Number_of_Windows'].isna()
]

In [228]:
nan_data_for_numofwindows.head()

Unnamed: 0,Property_Type,Property_Area,Number_of_Doors,Number_of_Windows
0,Apartment,106,1,
13,Single-family home,1359,1,
19,Apartment,497,3,
29,Bungalow,2008,5,
49,Container Home,423,1,


In [229]:
# Predictor columns for the rows where the window count is known.
x_train = non_nan_data_for_numofwindows.drop(columns='Number_of_Windows')

In [230]:
x_train.shape

(37845, 3)

In [231]:
# Regression target: the observed window counts.
y_train=non_nan_data_for_numofwindows['Number_of_Windows']
# NOTE(review): this shows the shape of the source frame, not of y_train —
# possibly y_train.shape was intended; confirm.
non_nan_data_for_numofwindows.shape

(37845, 4)

In [232]:
y_train.head()

1     2.0
2     4.0
3     3.0
4    14.0
5     3.0
Name: Number_of_Windows, dtype: float64

In [233]:
# Predictor columns for the rows whose window count must be imputed.
x_test = nan_data_for_numofwindows.drop(columns='Number_of_Windows')

In [234]:
x_test.head()

Unnamed: 0,Property_Type,Property_Area,Number_of_Doors
0,Apartment,106,1
13,Single-family home,1359,1
19,Apartment,497,3
29,Bungalow,2008,5
49,Container Home,423,1


In [235]:
# Frequency of each Property_Type among the rows used for fitting.
# NOTE(review): the counts include a junk-looking label ('#R%$G&867'); it is
# one-hot encoded like any other category further down — confirm that is intended.
x_train.Property_Type.value_counts().sort_values(ascending=False).head(20)

Apartment             13965
Single-family home    11017
Duplex                 5551
Bungalow               4079
Container Home         2743
#R%$G&867               490
Name: Property_Type, dtype: int64

In [236]:
# One-hot encode Property_Type for the fitting rows; columns are named "type_<category>".
dummies = pd.get_dummies(x_train['Property_Type'], prefix='type')
dummies

Unnamed: 0,type_#R%$G&867,type_Apartment,type_Bungalow,type_Container Home,type_Duplex,type_Single-family home
1,0,1,0,0,0,0
2,0,1,0,0,0,0
3,0,1,0,0,0,0
4,0,0,1,0,0,0
5,0,0,0,0,0,1
...,...,...,...,...,...,...
39494,0,0,0,0,0,1
39495,0,1,0,0,0,0
39496,0,0,1,0,0,0
39497,0,0,0,0,0,1


In [237]:
# Append the one-hot columns to the predictors (aligned on the shared row index).
x_train = pd.concat([x_train, dummies], axis=1)

In [238]:
# The raw categorical column is now redundant with its one-hot encoding.
x_train.drop(columns='Property_Type', inplace=True)

In [239]:
x_train

Unnamed: 0,Property_Area,Number_of_Doors,type_#R%$G&867,type_Apartment,type_Bungalow,type_Container Home,type_Duplex,type_Single-family home
1,733,2,0,1,0,0,0,0
2,737,2,0,1,0,0,0,0
3,900,2,0,1,0,0,0,0
4,2238,6,0,0,1,0,0,0
5,1185,3,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...
39494,1120,2,0,0,0,0,0,1
39495,445,3,0,1,0,0,0,0
39496,3780,6,0,0,1,0,0,0
39497,1266,1,0,0,0,0,0,1


In [240]:
# One-hot encode Property_Type for the rows to impute, then align the result to
# the columns produced for x_train. Encoding the two subsets independently only
# matches by luck: if a category were absent from x_test, a column would be
# missing and lr.predict would receive a design matrix with a different
# width/order than the one the model was fitted on. Categories missing here are
# filled with 0; categories unseen during fitting are dropped.
dummies_1 = (
    pd.get_dummies(x_test.Property_Type, prefix='type')
    .reindex(columns=dummies.columns, fill_value=0)
)
dummies_1

Unnamed: 0,type_#R%$G&867,type_Apartment,type_Bungalow,type_Container Home,type_Duplex,type_Single-family home
0,0,1,0,0,0,0
13,0,0,0,0,0,1
19,0,1,0,0,0,0
29,0,0,1,0,0,0
49,0,0,0,1,0,0
...,...,...,...,...,...,...
39438,0,0,0,1,0,0
39452,0,1,0,0,0,0
39456,0,1,0,0,0,0
39469,0,0,0,0,0,1


In [241]:
# Same two steps as for x_train: attach the one-hot columns, then remove the
# raw categorical column they replace.
x_test = pd.concat([x_test, dummies_1], axis=1)
x_test.drop(columns='Property_Type', inplace=True)

In [242]:
x_test

Unnamed: 0,Property_Area,Number_of_Doors,type_#R%$G&867,type_Apartment,type_Bungalow,type_Container Home,type_Duplex,type_Single-family home
0,106,1,0,1,0,0,0,0
13,1359,1,0,0,0,0,0,1
19,497,3,0,1,0,0,0,0
29,2008,5,0,0,1,0,0,0
49,423,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
39438,412,2,0,0,0,1,0,0
39452,571,1,0,1,0,0,0,0
39456,601,1,0,1,0,0,0,0
39469,1634,1,0,0,0,0,0,1


In [243]:
# NOTE(review): this import sits mid-notebook, while the other imports live in
# the first cell; consider moving it there so dependencies are visible up front.
from sklearn.linear_model import LinearRegression
# Linear regression model used to impute Number_of_Windows.
lr=LinearRegression()

In [244]:
# Fit on the rows with a known window count; predictors are Property_Area,
# Number_of_Doors and the type_* one-hot columns.
lr.fit(x_train,y_train)

LinearRegression()

In [245]:
# Predict the missing window counts. The values come back in the row order of
# x_test and are continuous (not rounded to whole windows).
y_pred=lr.predict(x_test)

In [258]:
# NOTE(review): this cell's execution count (258) is out of sequence with its
# neighbours (245, 246) — it was run after the rest of the notebook.
y_pred

array([2.49390836, 3.50636608, 2.51640563, ..., 2.49203857, 3.50532731,
       2.49169861])

In [246]:
# Display the predictions as a single row. The result is not assigned, so
# y_pred itself stays one-dimensional. Passing -1 lets NumPy infer the length
# instead of hard-coding the number of missing rows (1654), which would raise
# a ValueError for any other count.
y_pred.reshape(1, -1)

array([[2.49390836, 3.50636608, 2.51640563, ..., 2.49203857, 3.50532731,
        2.49169861]])

In [247]:
# Attach the predicted window counts as a new column (positional: y_pred is in
# the same row order as x_test).
x_test['Number_of_Windows'] = y_pred

In [248]:
# Replace the row labels with 0..n-1. The original train_data labels
# (0, 13, 19, ...) are discarded here (drop=True), so x_test can no longer be
# aligned back to train_data by index after this cell.
x_test.reset_index(inplace=True, drop=True)
x_test


Unnamed: 0,Property_Area,Number_of_Doors,type_#R%$G&867,type_Apartment,type_Bungalow,type_Container Home,type_Duplex,type_Single-family home,Number_of_Windows
0,106,1,0,1,0,0,0,0,2.493908
1,1359,1,0,0,0,0,0,1,3.506366
2,497,3,0,1,0,0,0,0,2.516406
3,2008,5,0,0,1,0,0,0,9.384663
4,423,1,0,0,0,1,0,0,1.513158
...,...,...,...,...,...,...,...,...,...
1649,412,2,0,0,0,1,0,0,1.525186
1650,571,1,0,1,0,0,0,0,2.492152
1651,601,1,0,1,0,0,0,0,2.492039
1652,1634,1,0,0,0,0,0,1,3.505327


In [249]:
# Put the observed target back next to its predictors (aligned on row index).
x_train = pd.concat([x_train, y_train], axis='columns')

In [250]:
# Replace the row labels with 0..n-1; the original train_data labels are
# discarded (drop=True), so positions here no longer match train_data rows.
x_train.reset_index(inplace=True, drop=True)
x_train

Unnamed: 0,Property_Area,Number_of_Doors,type_#R%$G&867,type_Apartment,type_Bungalow,type_Container Home,type_Duplex,type_Single-family home,Number_of_Windows
0,733,2,0,1,0,0,0,0,2.0
1,737,2,0,1,0,0,0,0,4.0
2,900,2,0,1,0,0,0,0,3.0
3,2238,6,0,0,1,0,0,0,14.0
4,1185,3,0,0,0,0,0,1,3.0
...,...,...,...,...,...,...,...,...,...
37840,1120,2,0,0,0,0,0,1,3.0
37841,445,3,0,1,0,0,0,0,1.0
37842,3780,6,0,0,1,0,0,0,6.0
37843,1266,1,0,0,0,0,0,1,3.0


In [251]:
# Stack the observed rows on top of the imputed rows and renumber 0..n-1.
# Row order is therefore "observed first, imputed last", not the original
# order of train_data.
new_data = pd.concat([x_train, x_test], axis='index', ignore_index=True)

In [252]:
new_data

Unnamed: 0,Property_Area,Number_of_Doors,type_#R%$G&867,type_Apartment,type_Bungalow,type_Container Home,type_Duplex,type_Single-family home,Number_of_Windows
0,733,2,0,1,0,0,0,0,2.000000
1,737,2,0,1,0,0,0,0,4.000000
2,900,2,0,1,0,0,0,0,3.000000
3,2238,6,0,0,1,0,0,0,14.000000
4,1185,3,0,0,0,0,0,1,3.000000
...,...,...,...,...,...,...,...,...,...
39494,412,2,0,0,0,1,0,0,1.525186
39495,571,1,0,1,0,0,0,0,2.492152
39496,601,1,0,1,0,0,0,0,2.492039
39497,1634,1,0,0,0,0,0,1,3.505327


In [253]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39499 entries, 0 to 39498
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Property_Area            39499 non-null  int64  
 1   Number_of_Doors          39499 non-null  int64  
 2   type_#R%$G&867           39499 non-null  uint8  
 3   type_Apartment           39499 non-null  uint8  
 4   type_Bungalow            39499 non-null  uint8  
 5   type_Container Home      39499 non-null  uint8  
 6   type_Duplex              39499 non-null  uint8  
 7   type_Single-family home  39499 non-null  uint8  
 8   Number_of_Windows        39499 non-null  float64
dtypes: float64(1), int64(2), uint8(6)
memory usage: 1.1 MB


In [254]:
# Write the predictions back onto exactly the rows that were missing.
# new_data is ordered "observed rows first, imputed rows last" with a fresh
# 0..n-1 index, so assigning new_data['Number_of_Windows'] by index alignment
# attached values to the wrong rows (e.g. row 0, originally NaN, received the
# observed value 2.0 that belongs to original row 1) and shuffled the observed
# values as well. nan_data_for_numofwindows still carries the original row
# labels, in the same order in which y_pred was produced.
train_data.loc[nan_data_for_numofwindows.index, 'Number_of_Windows'] = y_pred

In [255]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39499 entries, 0 to 39498
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Property_Type           39499 non-null  object 
 1   Property_Area           39499 non-null  int64  
 2   Number_of_Windows       39499 non-null  float64
 3   Number_of_Doors         39499 non-null  int64  
 4   Furnishing              38457 non-null  object 
 5   Frequency_of_Powercuts  38116 non-null  float64
 6   Power_Backup            39499 non-null  object 
 7   Water_Supply            39499 non-null  object 
 8   Traffic_Density_Score   39499 non-null  float64
 9   Crime_Rate              38712 non-null  object 
 10  Dust_and_Noise          38280 non-null  object 
 11  Air_Quality_Index       39499 non-null  float64
 12  Neighborhood_Review     39499 non-null  float64
 13  Habitability_score      39499 non-null  float64
dtypes: float64(6), int64(2), object(6)
mem

In [257]:
# Persist the frame with the imputed Number_of_Windows column.
# NOTE(review): the row index is written too (to_csv default), so the file gains
# an unnamed leading column that train.csv did not have — pass index=False if
# downstream readers do not expect it.
train_data.to_csv('train_data.csv')