In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn import metrics
from typing import List

%matplotlib inline

In [18]:
house_df = pd.read_csv("clean_house_data.csv")
# df = house_df.loc[:, house_df.isnull().mean() < .8]
# df
house_df.columns

Index(['id', 'Zip', 'Price', 'Price of square meter', 'Building Cond. values',
       'Number of facades', 'Number of rooms', 'Living area', 'Furnished',
       'Kitchen values', 'Surface of the land', 'Primary energy consumption',
       'Energy_classes', 'Terrace', 'Terrace surface', 'Garden',
       'Garden surface', 'Open fire', 'Swimming pool'],
      dtype='object')

In [19]:
# Remove outliers
def remove_outliers(df: pd.DataFrame, columns: List[str], n_std: int) -> pd.DataFrame:
    for col in columns:
        print('Working on column: {}'.format(col))
        
        mean = df[col].mean()
        sd = df[col].std()
        
        df = df[(df[col] <= mean+(n_std*sd))]
        
    return df
no_outliers_df = remove_outliers

In [20]:
# Identify and replace the relevant elements in the following lines of code.

# Delete these comments when done.

# Stop modifications when reached 'End of modifications'

HouseDF = pd.read_csv('House.csv')

sns.pairplot(HouseDF)

X = HouseDF[['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
               'Avg. Area Number of Bedrooms', 'Area Population']]

y = HouseDF['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)

lm = LinearRegression() 

lm.fit(X_train,y_train) 

# Now let’s evaluate the model by checking out its coefficients 
# and how we can interpret them.
print(lm.intercept_)

coeff_df = pd.DataFrame(lm.coef_,X.columns,columns=['Coefficient'])

"""
Output : 

Avg. Area Income : 21.528
Avg. Area House Age : 164883.28
Avg. Area Number of Rooms : 122368.67
Avg. Area Number of Bedrooms : 2233.80
Area Population : 15.15

What does coefficient of data says:

    Holding all other features fixed, a 1 unit increase in Avg. Area Income 
    is associated with an increase of $21.52 .
    Holding all other features fixed, a 1 unit increase in Avg. Area House Age 
    is associated with an increase of $164883.28 .
    Holding all other features fixed, a 1 unit increase in Avg. Area Number of Rooms 
    is associated with an increase of $122368.67 .
    Holding all other features fixed, a 1 unit increase in Avg. Area Number of Bedrooms 
    is associated with an increase of $2233.80 .
    Holding all other features fixed, a 1 unit increase in Area Population 
    is associated with an increase of $15.15 .
"""

predictions = lm.predict(X_test)  

plt.scatter(y_test,predictions)
# Good prediction would appear as a line.

sns.distplot((y_test-predictions),bins=50);
# If the data is displayed in a bell shape, we did well

print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions)) 
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions))) 

# End of modifications

"""
Below is an example of how a sql based regression works
# Connect to the database
connection = pymysql.connect(
    host='localhost',
    user='username',
    password='password',
    database='database_name'
)

# Read data into a Pandas DataFrame
df = pd.read_sql('SELECT * FROM table_name', con=connection)

# Prepare the data
X = df[['feature_1', 'feature_2']]
y = df['price']

# Create a logistic regression model
model = LogisticRegression()

# Train the model
model.fit(X, y)

# Close the connection
connection.close()

"""

FileNotFoundError: [Errno 2] No such file or directory: 'House.csv'