In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
!ls '/kaggle/input/apartment-rental-offers-in-germany'

In [None]:
df = pd.read_csv('/kaggle/input/apartment-rental-offers-in-germany/immo_data.csv')
df.head()

In [None]:
# Show every column when a frame is displayed, then load the raw listings.
pd.set_option('display.max_columns', None)
DATA_PATH = '/kaggle/input/apartment-rental-offers-in-germany/immo_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
df.info()

In [None]:
df.info()

In [None]:
df.describe()

# delete columns with more than 50% null data

In [None]:
df.isna().sum()/len(df)

In [None]:
# names of the columns whose share of null values exceeds 50%
df.columns[((df.isna().sum()/len(df)) > 0.50)]

In [None]:
# Drop every column in which more than half of the entries are missing.
null_share = df.isna().sum() / len(df)
mostly_null = null_share[null_share > 0.50].index
df = df.drop(columns=mostly_null)
df.columns

# delete rows with inappropriate values

In [None]:
df[df['livingSpace'] == 0.0].shape[0]

In [None]:
df[df['livingSpace'] == 0.0].index

In [None]:
df.shape

In [None]:
# Remove the rows whose living space is exactly zero.
zero_space_rows = df.index[df['livingSpace'] == 0.0]
df = df.drop(index=zero_space_rows)
df.shape

In [None]:
df[df['totalRent'] == 0.0].shape[0]

In [None]:
df.shape

In [None]:
# Remove the rows whose total rent is exactly zero.
zero_rent_rows = df.index[df['totalRent'] == 0.0]
df = df.drop(index=zero_rent_rows)
df.shape

In [None]:
df.shape

# delete columns without useful information

In [None]:
df.head()

In [None]:
df['date'].value_counts()

In [None]:
# Columns removed in this step, listed one per line for readability.
uninformative_columns = [
    'facilities',
    'description',
    'livingSpaceRange',
    'scoutId',
    'street',
    'streetPlain',
    'houseNumber',
    'date',
]
df = df.drop(columns=uninformative_columns)

In [None]:
df.info()

# fillna numeric data by mean

In [None]:
df._get_numeric_data().mean()

In [None]:
# Replace missing values in the numeric columns with that column's mean.
# select_dtypes is the public way to pick numeric columns (the original relied on
# the private DataFrame._get_numeric_data), and the result is assigned back
# instead of mutating the frame with inplace=True.
numeric_means = df.select_dtypes(include='number').mean()
df = df.fillna(numeric_means)

In [None]:
df.isna().sum()

# delete outliers

In [None]:
for cols in df.columns:
    print(df[cols].dtype)

In [None]:
# Mean +/- 3 standard deviations for serviceCharge, printed step by step.
service_mean = df['serviceCharge'].mean()
service_std = df['serviceCharge'].std()
print('col mean : ' , service_mean)
print('col std : ' , service_std)

upper_range = service_mean + 3 * service_std
print('upper range : ',upper_range)
lower_range = service_mean - 3 * service_std
print('lower range : ',lower_range)

In [None]:
df[(df['serviceCharge'] > upper_range) | (df['serviceCharge'] < lower_range)]

In [None]:
df.shape

In [None]:
# Three-sigma filter applied one int64/float64 column at a time. Rows are dropped
# immediately, so each column's mean/std is computed on the rows that survived
# the previous columns.
numeric_columns = [name for name in df.columns if df[name].dtype in ('int64', 'float64')]
for name in numeric_columns:
    center = df[name].mean()
    spread = df[name].std()

    too_high = df[name] > center + 3 * spread
    too_low = df[name] < center - 3 * spread
    df = df.drop(df.index[too_high | too_low])

In [None]:
df.shape

# fillna categorical data

In [None]:
for cols in df.columns:
    if df[cols].dtype == 'object' or df[cols].dtype == 'bool':
        print('column : ',cols)
        print(df[cols].value_counts().head(1))

In [None]:
for cols in df.columns:
    if df[cols].dtype == 'object' or df[cols].dtype == 'bool':
        print(df[cols].value_counts().head(1).index[0])

In [None]:
# Fill the gaps of every object/bool column with that column's most frequent value.
for cols in df.columns:
    if df[cols].dtype == 'object' or df[cols].dtype == 'bool':
        most_common = df[cols].value_counts().head(1).index[0]
        print('cols : {} , value : {}'.format(cols , most_common))
        # Assign the result back: fillna(inplace=True) on df[cols] is chained
        # assignment, which pandas warns about and which no longer updates df
        # once Copy-on-Write is enabled.
        df[cols] = df[cols].fillna(most_common)

In [None]:
df.isna().sum()

# delete categorical features with lots of states

In [None]:
for cols in df.columns:
    if df[cols].dtype == 'object' or df[cols].dtype == 'bool':
        print('cols : {} , unique values : {}'.format(cols,df[cols].nunique()))

In [None]:
# Categorical columns removed in this step.
high_cardinality_columns = ['firingTypes', 'geo_krs', 'regio2', 'regio3', 'geo_bln']
df = df.drop(columns=high_cardinality_columns)
df.shape

# reduce number of categories

In [None]:
df.info()

In [None]:
df['regio1'].value_counts()*100/len(df)

In [None]:
def edit_regio1(x):
    """Return 'other' for 'Hamburg', 'Bremen' or 'Saarland'; any other value unchanged."""
    merged_states = ('Hamburg', 'Bremen', 'Saarland')
    return 'other' if x in merged_states else x
    
# Append the reduced region column and remove the original in one chained step.
df = (
    df.assign(regio1_edit=df['regio1'].apply(edit_regio1))
      .drop(columns=['regio1'])
)

In [None]:
df['regio1_edit'].value_counts()*100/len(df)

In [None]:
df['heatingType'].value_counts()*100 / len(df)

In [None]:
list(df['heatingType'].value_counts().tail(8).index)

In [None]:
others = list(df['heatingType'].value_counts().tail(8).index)
def edit_heating(x):
    """Return 'other' when x is listed in the module-level `others`, else x unchanged."""
    # `others` is looked up at call time, so it must hold the heating categories
    # when this function is applied.
    return 'other' if x in others else x

# Append the reduced heating column, remove the original, then show the new shares in percent.
df = (
    df.assign(heatingType_edit=df['heatingType'].apply(edit_heating))
      .drop(columns=['heatingType'])
)
df['heatingType_edit'].value_counts()*100 / len(df)

In [None]:
df['condition'].value_counts()

In [None]:
list(df['condition'].value_counts().tail(4).index)

In [None]:
others = list(df['condition'].value_counts().tail(4).index)

def edit_condition(x):
    """Return 'other' when x is listed in the module-level `others`, else x unchanged."""
    if x not in others:
        return x
    return 'other'
    
# Append the reduced condition column, remove the original, then show the new counts.
df = (
    df.assign(condition_edit=df['condition'].apply(edit_condition))
      .drop(columns=['condition'])
)
df['condition_edit'].value_counts()

In [None]:
df['typeOfFlat'].value_counts()

In [None]:
list(df['typeOfFlat'].value_counts().tail(2).index)

In [None]:
def edit_typeOfFlat(x):
    """Merge 'half_basement' and 'loft' into 'half_loft'; return any other value unchanged."""
    merged_types = ('half_basement', 'loft')
    return 'half_loft' if x in merged_types else x
    
# Append the reduced flat-type column, remove the original, then show the new counts.
df = (
    df.assign(typeOfFlat_edit=df['typeOfFlat'].apply(edit_typeOfFlat))
      .drop(columns=['typeOfFlat'])
)
df['typeOfFlat_edit'].value_counts()

In [None]:
df.shape

# See DataSet and Visualization (extra)

In [None]:
df.columns

In [None]:
df['regio1_edit'].value_counts()

In [None]:
df['regio1_edit'].value_counts().plot(kind='bar')

In [None]:
# Horizontal bar chart of listings per region, labelled through the returned Axes.
sns.set(font_scale=1.4)
region_ax = df['regio1_edit'].value_counts().plot(kind='barh', figsize=(15, 10), rot=0)
region_ax.set_xlabel("Count of advertisements", labelpad=14)
region_ax.set_ylabel("Regions", labelpad=5)
region_ax.set_title("Advertisements of differents regions in Germany", y=1.02);

In [None]:
df.numberOfFloors.value_counts()

# Hypothesis Test

In [None]:
sns.set(rc={'figure.figsize':(15,10)})
sns.distplot(df['livingSpace'],kde_kws={"label": 'livingSpace'}, bins=20)

In [None]:
stats.normaltest(df["livingSpace"])

In [None]:
sns.set(rc={'figure.figsize':(15,10)})
sns.distplot(np.sqrt(df["livingSpace"]),kde_kws={"label": 'livingSpace'}, bins=20)

In [None]:
stats.normaltest(np.sqrt(df["livingSpace"]))

In [None]:
sns.set(rc={'figure.figsize':(15,10)})
sns.distplot(np.log(df["livingSpace"]),kde_kws={"label": 'livingSpace'}, bins=20)

In [None]:
stats.normaltest(np.log(df["livingSpace"]))

In [None]:
stats.normaltest(np.log10(df["livingSpace"]))

In [None]:
sns.set(rc={'figure.figsize':(15,10)})
sns.distplot(1/df["livingSpace"],kde_kws={"label": 'livingSpace'}, bins=20)

In [None]:
stats.normaltest(1/df["livingSpace"])

In [None]:
sns.set(rc={'figure.figsize':(15,10)})
sns.distplot(df["livingSpace"]**2,kde_kws={"label": 'livingSpace'}, bins=20)

In [None]:
stats.normaltest(df["livingSpace"]**2)

## Spearman’s Rank Correlation

In [None]:
# q1.relation between living Space and totalRent

# Call spearmanr from the public scipy.stats namespace; scipy.stats.stats is a
# deprecated private module path.
stats.spearmanr(df['livingSpace'],df['totalRent'])

## Analysis of Variance Test (ANOVA)

In [None]:
# q2.relation between region1 and totalRent

# One totalRent sample per region (in order of first appearance), compared with a one-way ANOVA.
rent_by_region = [
    df.loc[df.regio1_edit == region, 'totalRent']
    for region in df.regio1_edit.unique()
]
fstat, pval = stats.f_oneway(*rent_by_region)
print("Oneway Anova totalRent ~ edit region1 F=%.2f, p-value=%E" % (fstat, pval))

In [None]:
plt.figure(figsize=(25,8))
# Pass x and y by keyword: seaborn >= 0.12 rejects them as positional arguments,
# while the keyword form works on older releases as well.
# NOTE(review): the ANOVA cell above tests totalRent, but this plot shows
# livingSpace per region — confirm which variable is intended.
sns.violinplot(x="regio1_edit", y="livingSpace", data=df)

## Chi-Squared Test

In [None]:
#q3  independence of having serviceCharge and havingKitchen

def edit(x):
    """Return 0 when x equals 0.0 and 1 for every other value."""
    return 0 if x == 0.0 else 1
    
# Flag listings that have a non-zero service charge.
df['has_serviceCharge'] = df['serviceCharge'].apply(edit)

# chi2_contingency expects a table of observed counts per category combination.
# The original passed the raw has_serviceCharge / livingSpace rows, so each row
# was treated as a count cell and the kitchen flag named in the question was
# never used. Cross-tabulate the two categorical flags first.
charge_by_kitchen = pd.crosstab(df['has_serviceCharge'], df['hasKitchen'])
stats.chi2_contingency(charge_by_kitchen)

## Point-biserial correlation test

In [None]:
#q4 relation between having serviceCharge (binary flag) and livingSpace (continuous)

stats.pointbiserialr(df['has_serviceCharge'],df["livingSpace"])

In [None]:
del df['has_serviceCharge']

## t-test

In [None]:
#q5 in Berlin relation hasKitchen and totalRent
berlin_df = df[df['regio1_edit'] == 'Berlin']
berlin_df.head()

In [None]:
# Two-sample t-test of totalRent between Berlin listings with and without a kitchen.
# The original compared the hasKitchen flags themselves against the rent values,
# which tests whether a boolean column and a rent column share a mean.
has_kitchen = berlin_df['hasKitchen'].astype(bool)
stats.ttest_ind(berlin_df.loc[has_kitchen, 'totalRent'],
                berlin_df.loc[~has_kitchen, 'totalRent'])

In [None]:
#q6 test is mean of living space is 60
stats.ttest_1samp(berlin_df['livingSpace'],popmean = 60)

In [None]:
del berlin_df

# normalizing numeric data

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
# z-score every int64/float64 column except livingSpace, which is left on its original scale.
columns_to_scale = [
    name for name in df.columns
    if df[name].dtype in ('int64', 'float64') and name != 'livingSpace'
]
for name in columns_to_scale:
    values = df[name]
    df[name] = (values - values.mean()) / values.std()

In [None]:
df.head()

In [None]:
df.describe()

# Correlation Matrix

In [None]:
# The frame still contains object columns at this point (they are dummy-encoded
# in a later cell), and pandas >= 2.0 raises on non-numeric columns in corr()
# instead of silently skipping them. Restrict to numeric/bool columns
# explicitly so the cell works on old and new pandas alike.
corr = df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Annotated correlation heatmap drawn on an explicitly created 20x20 figure.
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, annot=True, fmt='.2f', square=True, ax=ax)

# convert categorical data to dummies variables

In [None]:
# Names of all object- or bool-typed columns, in frame order.
columns = [
    name for name in df.columns
    if df[name].dtype == 'object' or df[name].dtype == 'bool'
]

columns

In [None]:
dummies_feature = pd.get_dummies(df[columns])
dummies_feature.head()

In [None]:
dummies_feature.shape

In [None]:
df = pd.concat([df, dummies_feature], axis=1)
df.head()

In [None]:
# Remove the original object/bool columns after their encoded versions were concatenated.
# NOTE(review): pd.get_dummies appears to pass bool columns through unchanged, so after
# the concat they exist twice under the same name and this drop would remove both
# copies, leaving the bool features out of the model entirely — TODO confirm.
df = df.drop(columns=columns)
df.head()

In [None]:
df.info()

# split features and target

In [None]:
# Target: livingSpace. Features: every remaining column.
# Convert explicitly to float64: pandas >= 2.0 creates bool dummy columns, and
# .values on a frame mixing float and bool columns yields an object array,
# which makes the NumPy arithmetic in the cells below slow.
y = df['livingSpace'].to_numpy(dtype='float64')
x = df.drop(columns = ['livingSpace']).to_numpy(dtype='float64')

print(x.shape)
print(y.shape)

In [None]:
train_size = int(0.8 * x.shape[0])
train_size

In [None]:
# Positional split: the first train_size rows train the model, the rest evaluate it.
x_train, x_test = x[:train_size], x[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

for part in (x_train, y_train):
    print(part.shape)

for part in (x_test, y_test):
    print(part.shape)

# PCA

In [None]:
from sklearn.decomposition import PCA

# Keep enough components to explain 90% of the variance.
# Fit on the training rows only, so the held-out rows do not influence the
# projection they are later evaluated with. The full matrix is then transformed
# so x_pca keeps one row per sample and can still be sliced with train_size.
pca = PCA(0.90)
pca.fit(x[:train_size])
x_pca = pca.transform(x)
x_pca.shape

In [None]:
# Same positional split as for the raw features, applied to the PCA projection.
x_pca_train, x_pca_test = x_pca[:train_size], x_pca[train_size:]
print(x_pca_train.shape)
print(x_pca_test.shape)

# Regression

## All Features

\begin{equation}
\hat{y} = w_1 * x_1 + w_2 * x_2 + \cdots + w_{61} * x_{61} + b
\end{equation}

\begin{align}
&error^{i} = \frac{1}{2}({y_{train}}^{i} - \hat{y}^{i})\\
&\mbox{MSE} = \frac{1}{N} \sum_{i = 1}^{N} (error^{i})^2\\
&\mbox{MSE} = \frac{1}{N} \big((error^{0})^2 + (error^{1})^2 + \cdots + (error^{N})^2 \big)\\
&\mbox{MSE} = \frac{1}{4N} \big((y_{train} ^{0} - (w_1 * x_1 ^{0} + \cdots + w_{61} * x_{61} ^{0} + b))^2 + \cdots + (y_{train} ^{N} - (w_1 * x_1 ^{N} + \cdots + w_{61} * x_{61} ^{N} + b))^2 \big)
\end{align}

\begin{align}
&\frac{\partial\mbox{MSE}}{\partial w_{1}} = \frac{-2}{2N}\big(error^{0}*{x_1}^{0} + error^{1}*{x_1}^{1} + \cdots + error^{N}*{x_1}^{N}\big) = \frac{-2}{2N} \big( \sum_{i = 1}^{N} error^{i} * x_{1} ^{i} \big) \\
&\frac{\partial\mbox{MSE}}{\partial w_{2}} = \frac{-2}{2N}\big(error^{0}*{x_2}^{0} + error^{1}*{x_2}^{1} + \cdots + error^{N}*{x_2}^{N}\big) = \frac{-2}{2N} \big( \sum_{i = 1}^{N} error^{i} * x_{2} ^{i} \big) \\
&\vdots\\
&\frac{\partial\mbox{MSE}}{\partial w_{61}} = \frac{-2}{2N}\big(error^{0}*{x_{61}}^{0} + error^{1}*{x_{61}}^{1} + \cdots + error^{N}*{x_{61}}^{N}\big) = \frac{-2}{2N} \big( \sum_{i = 1}^{N} error^{i} * x_{61} ^{i} \big) \\
&\frac{\partial\mbox{MSE}}{\partial b} = \frac{-2}{2N} \big( \sum_{i = 1}^{N} error^{i} \big)
\end{align}


In [None]:
x_train[:,0].shape

In [None]:
# Batch gradient descent for a linear model on all features.
# NOTE: predictions are np.sum(w * x + b, axis=1), i.e. b is added to every
# feature term before the row sum, so the effective intercept is
# x.shape[1] * b. The expression is kept as is because the prediction cell
# that follows uses the same formula.

# Initializes parameters "w" and "b" randomly
np.random.seed(42)
b = np.random.randn(1)
# one weight per feature column instead of a hard-coded 61
w = np.random.randn(x_train.shape[1])
n = x_train.shape[0]

# Sets learning rate
lr = 0.001

# Defines number of epochs
n_epochs = 500

test_error = y_test - np.sum((w * x_test) + b , axis = 1)
test_mse = (test_error**2).mean()
print('init MSE : ',test_mse)

for epoch in range(n_epochs):
    error = y_train - np.sum((w * x_train) + b , axis = 1)

    if epoch % 100 == 0:
        print('epoch {} , MSE : {}'.format(epoch,(error**2).mean()))

    # All partial derivatives at once: column-wise mean of x * error replaces
    # the per-feature Python loop.
    w_grad = -1 * (x_train * error[:, np.newaxis]).mean(axis=0)
    b_grad = -1 * error.mean()

    w = w - (lr * w_grad)
    b = b - (lr * b_grad)


test_error = y_test - np.sum((w * x_test) + b , axis = 1)
test_mse = (test_error**2).mean()
print('Final MSE : ',test_mse)

In [None]:
y_pred = np.sum((w * x_test) + b , axis = 1)
temp = pd.DataFrame({'test':y_test,'pred':y_pred})
temp.head()

In [None]:
# Percentage of test rows whose prediction lies within +/-20% of the true value.
temp['upper_range'] = temp['test'] * 1.2
temp['lower_range'] = temp['test'] * 0.8

within_band = (temp['pred'] <= temp['upper_range']) & (temp['pred'] >= temp['lower_range'])
int(within_band.sum()) * 100/temp.shape[0]

In [None]:
# Batch gradient descent for a linear model on all features (learning rate 0.01).
# NOTE: predictions are np.sum(w * x + b, axis=1), i.e. b is added to every
# feature term before the row sum, so the effective intercept is
# x.shape[1] * b. The expression is kept as is because the prediction cell
# that follows uses the same formula.

# Initializes parameters "w" and "b" randomly
np.random.seed(42)
b = np.random.randn(1)
# one weight per feature column instead of a hard-coded 61
w = np.random.randn(x_train.shape[1])
n = x_train.shape[0]

# Sets learning rate
lr = 0.01

# Defines number of epochs
n_epochs = 500

test_error = y_test - np.sum((w * x_test) + b , axis = 1)
test_mse = (test_error**2).mean()
print('init MSE : ',test_mse)

for epoch in range(n_epochs):
    error = y_train - np.sum((w * x_train) + b , axis = 1)

    if epoch % 100 == 0:
        print('epoch {} , MSE : {}'.format(epoch,(error**2).mean()))

    # All partial derivatives at once: column-wise mean of x * error replaces
    # the per-feature Python loop.
    w_grad = -1 * (x_train * error[:, np.newaxis]).mean(axis=0)
    b_grad = -1 * error.mean()

    w = w - (lr * w_grad)
    b = b - (lr * b_grad)


test_error = y_test - np.sum((w * x_test) + b , axis = 1)
test_mse = (test_error**2).mean()
print('Final MSE : ',test_mse)

In [None]:
y_pred = np.sum((w * x_test) + b , axis = 1)
temp = pd.DataFrame({'test':y_test,'pred':y_pred})
temp.head()

In [None]:
# Percentage of test rows whose prediction lies within +/-20% of the true value.
temp['upper_range'] = temp['test'] * 1.2
temp['lower_range'] = temp['test'] * 0.8

within_band = (temp['pred'] <= temp['upper_range']) & (temp['pred'] >= temp['lower_range'])
int(within_band.sum()) * 100/temp.shape[0]

## Use scikit-learn linear model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
Linear = LinearRegression()

In [None]:
Linear.fit(x_train,y_train)
print(Linear.coef_)
print(Linear.intercept_)

In [None]:
y_pred = Linear.predict(x_test)

In [None]:
mean_squared_error(y_pred,y_test)

In [None]:
temp = pd.DataFrame({'test':y_test,'pred':y_pred})
temp.head()

In [None]:
# Percentage of test rows whose prediction lies within +/-20% of the true value.
temp['upper_range'] = temp['test'] * 1.2
temp['lower_range'] = temp['test'] * 0.8

within_band = (temp['pred'] <= temp['upper_range']) & (temp['pred'] >= temp['lower_range'])
int(within_band.sum()) * 100/temp.shape[0]

## Use PCA features

In [None]:
x_pca_train.shape

In [None]:
# Batch gradient descent on the PCA-projected features.
n, n_components = x_pca_train.shape

# Random starting point for the weights "w" and the bias "b"
np.random.seed(42)
b = np.random.randn(1)
w = np.random.randn(n_components)

# Step size and number of passes over the training rows
lr = 0.01
n_epochs = 500

test_error = y_test - np.sum((w * x_pca_test) + b , axis = 1)
test_mse = (test_error**2).mean()
print('init MSE : ',test_mse)

for epoch in range(n_epochs):
    error = y_train - np.sum((w * x_pca_train) + b , axis = 1)

    if epoch % 100 == 0:
        print('epoch {} , MSE : {}'.format(epoch,(error**2).mean()))

    # Gradients come from the residuals of this epoch, computed before any weight changes.
    w_grad = [-1 * (component * error).mean() for component in x_pca_train.T]
    b_grad = -1 * error.mean()

    for i, grad in enumerate(w_grad):
        w[i] = w[i] - (lr * grad)
    b = b - (lr*b_grad)


test_error = y_test - np.sum((w * x_pca_test) + b , axis = 1)
test_mse = (test_error**2).mean()
print('Final MSE : ',test_mse)

In [None]:
y_pred = np.sum((w * x_pca_test) + b , axis = 1)
temp = pd.DataFrame({'test':y_test,'pred':y_pred})
temp.head()

In [None]:
# Percentage of test rows whose prediction lies within +/-20% of the true value.
temp['upper_range'] = temp['test'] * 1.2
temp['lower_range'] = temp['test'] * 0.8

within_band = (temp['pred'] <= temp['upper_range']) & (temp['pred'] >= temp['lower_range'])
int(within_band.sum()) * 100/temp.shape[0]

## use just noRooms feature

In [None]:
df.head()

In [None]:
X = df['noRooms'].values
X.shape

In [None]:
X_train = X[:train_size]
print(X_train.shape)
X_test = X[train_size:]
print(X_test.shape)

In [None]:
# Batch gradient descent for the single-feature model y_hat = w * noRooms + b,
# with a step-wise learning-rate decay.

# Initializes parameters "w" and "b" randomly
np.random.seed(42)
b = np.random.randn(1)
w = np.random.randn(1)
n = X_train.shape[0]

# Sets learning rate
lr = 0.1

# Defines number of epochs
n_epochs = 500

test_error =  y_test - ((w * X_test) + b)
test_mse = (test_error**2).mean()
print('init MSE : ',test_mse)
print('init learning rate : ',lr)

for epoch in range(n_epochs):
    error = y_train - ((w * X_train) + b)

    if epoch % 100 == 0:
        print('epoch {} , MSE : {}'.format(epoch,(error**2).mean()))

    # adaptive learning rate: divide by 10 every 200 epochs.
    # Epoch 0 is excluded; otherwise the rate is cut before the first update
    # and the initial rate printed above is never actually used.
    if epoch > 0 and epoch % 200 == 0:
        lr = lr * 0.1

    w_grad = -1 * (X_train * error).mean()
    w = w - (lr * w_grad)

    b_grad = -1 * error.mean()
    b = b - (lr*b_grad)


test_error = y_test - ((w * X_test) + b)
test_mse = (test_error**2).mean()
print('Final MSE : ',test_mse)
print('Final learning rate : ',lr)

In [None]:
y_pred = (w * X_test) + b
temp = pd.DataFrame({'test':y_test,'pred':y_pred})
temp.head()

In [None]:
# Percentage of test rows whose prediction lies within +/-20% of the true value.
temp['upper_range'] = temp['test'] * 1.2
temp['lower_range'] = temp['test'] * 0.8

within_band = (temp['pred'] <= temp['upper_range']) & (temp['pred'] >= temp['lower_range'])
int(within_band.sum()) * 100/temp.shape[0]