In [2]:
import subprocess
import sys


def install(package):
    """Install ``package`` with pip, using the interpreter that runs this notebook.

    Going through ``sys.executable -m pip`` targets the same environment the
    kernel is using. ``subprocess.CalledProcessError`` propagates if pip exits
    with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

def freeze_requirements():
    """Write the output of ``pip freeze`` to ``requirements.txt`` in the working directory.

    The pip output is captured first and the file is only opened once pip has
    succeeded, so a failing ``pip freeze`` no longer truncates an existing
    ``requirements.txt``. On failure a message is printed and the function
    returns without touching the file.
    """
    try:
        frozen = subprocess.check_output([sys.executable, "-m", "pip", "freeze"])
    except subprocess.CalledProcessError as e:
        print(f"Failed to freeze requirements: {e}")
        return
    # Binary mode writes pip's bytes unchanged, as the previous stdout redirect did.
    with open('requirements.txt', 'wb') as f:
        f.write(frozen)

In [None]:


# Third-party libraries used by this notebook; each one is installed with its
# own pip invocation.
packages = [
    "numpy",
    "pandas",
    "matplotlib",
    "seaborn",
    "scikit-learn",
]
for pkg in packages:
    install(pkg)

# Record the resulting environment in requirements.txt.
freeze_requirements()

In [None]:
import pandas as pd
import numpy as np
import os

# Local layout for the car-price dataset: <PROJECT_ROOT_DIR>/datasets/<FOLDER_NAME>/
# NOTE(review): a later cell rebinds PROJECT_ROOT_DIR to "includes/" (with an "s");
# confirm which directory name is intended.
PROJECT_ROOT_DIR = "include/"
FOLDER_NAME = "CarPriceDataset_Final"
import_csv_through_url = 'https://raw.githubusercontent.com/Sudipta1975git/Car_Price_dataset/main/CarPriceDataset_Final.csv'
# NOTE(review): despite its name, HOUSING_PATH holds the car-price dataset
# directory here; a later cell rebinds it for the housing data.
HOUSING_PATH = os.path.join(PROJECT_ROOT_DIR, "datasets", FOLDER_NAME)
os.makedirs(HOUSING_PATH, exist_ok=True)

# Built with os.path.join: plain concatenation dropped both separators and
# produced "include/datasetsCarPriceDataset_Final".
car_price_url = os.path.join(PROJECT_ROOT_DIR, "datasets", FOLDER_NAME)

# File path to save the downloaded CSV
local_csv_path = os.path.join(HOUSING_PATH, "CarPriceDataset_Final.csv")
# Read the CSV straight from the URL, then keep a local copy without the index column.
df = pd.read_csv(import_csv_through_url)
df.to_csv(local_csv_path, index=False)

In [None]:
# Base URL of the example-data repository; dataset-relative paths are appended to it.
data_root = "https://github.com/ageron/data/raw/main/"
lifesat = pd.read_csv(f"{data_root}lifesat/lifesat.csv")


In [None]:
# Selecting with a one-element column list keeps each result 2-D
# (one row per sample, one column).
X, y = (
    lifesat[["GDP per capita (USD)"]].values,
    lifesat[["Life satisfaction"]].values,
)

In [None]:
# X = pd.DataFrame([1,2,3,4,5,6])

# y = pd.DataFrame([11,12,13,14,15,16])

def mean(arr):
    """Return the arithmetic mean of ``arr``.

    Elements are read by position (``arr[0]`` .. ``arr[len(arr) - 1]``) and
    added one at a time, so any indexable container whose items support ``+``
    and ``/`` works. An empty ``arr`` raises ``ZeroDivisionError``.
    """
    count = len(arr)
    running_total = 0
    for position in range(count):
        running_total = running_total + arr[position]
    return running_total / count

def variance(x):
    """Return the population variance of ``x`` (squared deviations divided by ``len(x)``).

    Relies on ``mean`` for the centre and reads elements by position.
    """
    count = len(x)
    centre = mean(x)
    squared_deviation_total = 0
    for position in range(count):
        deviation = x[position] - centre
        squared_deviation_total = squared_deviation_total + deviation ** 2
    return squared_deviation_total / count

def covariance(x, y):
    """Return the population covariance of ``x`` and ``y`` (divides by ``len(x)``).

    The loop runs over ``len(x)`` positions, so ``y`` must provide at least
    that many elements.
    """
    count = len(x)
    x_centre = mean(x)
    y_centre = mean(y)
    product_total = 0
    for position in range(count):
        x_deviation = x[position] - x_centre
        y_deviation = y[position] - y_centre
        product_total += x_deviation * y_deviation
    return product_total / count

def slope(x, y):
    """Return the least-squares slope of ``y`` on ``x``: covariance(x, y) / variance(x)."""
    joint_spread = covariance(x, y)
    x_spread = variance(x)
    return joint_spread / x_spread

def intercept(x, y, slope):
    """Return the intercept of the fitted line: mean(y) - slope * mean(x).

    The ``slope`` parameter is the already-computed slope value; inside this
    function it hides the module-level ``slope`` function.
    """
    y_centre = mean(y)
    x_centre = mean(x)
    return y_centre - slope * x_centre

def predict(x, slope, intercept):
    """Evaluate the line ``slope * x + intercept`` at ``x``."""
    scaled = slope * x
    return scaled + intercept

# Fit the hand-written simple linear regression on X and y.
slope_value = slope(X, y)
intercept_value = intercept(X, y, slope_value)
print(f"Slope: {slope_value}", f"Intercept: {intercept_value}", sep="\n")

# Evaluate the fitted line at one new input value.
new_val = 37_655.2
predictions = predict(new_val, slope_value, intercept_value)
print(f"Predictions: {predictions}")

In [None]:
# Echo the dataset base URL as this cell's output.
data_root

In [None]:
from sklearn.linear_model import LinearRegression

# Fit scikit-learn's LinearRegression on the same X and y for comparison
# with the hand-written implementation.
model = LinearRegression()
model.fit(X,y)

# predict() needs a 2-D array with one row per sample. Reshaping, rather than
# wrapping in another pair of brackets, makes this cell safe to re-run: a
# value that is already shaped (1, 1) stays (1, 1) instead of gaining dimensions.
new_val = np.reshape(new_val, (1, 1))
print(model.predict(new_val))


# Hands-On Machine Learning [Chapter: 2]

<h5>Housing Complex Dataset</h5>

In [None]:
# Python ≥3.6 is required (the notebook uses f-strings and underscore numeric literals)
import sys
assert sys.version_info >= (3, 6)

# Scikit-Learn ≥0.20 is required
import sklearn
# Compare (major, minor) as integers: as plain strings "0.9" >= "0.20" is True.
assert tuple(int(part) for part in sklearn.__version__.split(".")[:2]) >= (0, 20)

# Common imports
import numpy as np
import os

# To plot pretty figures
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes: 14 for axis labels, 12 for x and y tick labels.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
# NOTE(review): an earlier cell sets PROJECT_ROOT_DIR to "include/" (no "s");
# confirm which spelling is intended, since both directories get created.
PROJECT_ROOT_DIR = "includes/"
CHAPTER_ID = "end_to_end_project"
# Figures go under <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>; create the folder up front.
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure as ``<fig_id>.<fig_extension>`` under IMAGES_PATH.

    Args:
        fig_id: File name stem; also echoed in the progress message.
        tight_layout: When true, ``plt.tight_layout()`` is applied before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Passed to ``savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)


<h3>Get the Data</h3>
<h5>Download the Data</h5>

In [None]:
import os
import tarfile
import urllib.request
from pathlib import Path
import pandas as pd

# Base URL of the repository that hosts the housing archive.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Join each component separately so the result does not depend on
# PROJECT_ROOT_DIR ending with a slash.
HOUSING_PATH = os.path.join(PROJECT_ROOT_DIR, "datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive, unpack it, and load ``housing.csv``.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Directory that receives the archive and its extracted
            contents; created if it does not exist.

    Returns:
        The ``pandas.DataFrame`` read from ``housing.csv`` inside ``housing_path``.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # The archive comes from the network, so use the restrictive "data"
        # extraction filter on Python versions that provide it.
        if hasattr(tarfile, "data_filter"):
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)
    # Read from the directory that was just populated, so a caller-supplied
    # housing_path is honoured rather than a hard-coded location.
    return pd.read_csv(Path(housing_path) / "housing.csv")

# Download the archive (this happens on every run) and load the CSV into a DataFrame.
housing = fetch_housing_data()

In [None]:
# Preview the first rows of the dataset.
housing.head()

In [None]:
# Column names, dtypes and non-null counts.
housing.info()

In [None]:
# Number of rows for each distinct ocean_proximity value.
housing['ocean_proximity'].value_counts()

In [None]:
# Summary statistics for the numeric columns.
housing.describe()

In [None]:
import matplotlib.pyplot as plt

# One 50-bin histogram per numeric column, drawn on a 12x8 inch figure.
housing.hist(bins=50,figsize=(12,8))
plt.show()

In [None]:
import numpy as np

def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a training set and a test set.

    Args:
        data: ``pandas.DataFrame`` (or any object offering positional ``.iloc``).
        test_ratio: Fraction of rows assigned to the test set; the test size
            is ``int(len(data) * test_ratio)``, i.e. rounded down.

    Returns:
        A ``(train, test)`` tuple of row subsets of ``data``.

    The shuffle uses NumPy's global random state, so call ``np.random.seed``
    beforehand if a reproducible split is needed.
    """
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    # .iloc is subscripted, not called: the previous data.iloc([test_indices])
    # raised instead of returning the test rows.
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
from zlib import crc32

def is_id_in_test_set(identifier, test_ratio):
    """Decide from a hash of ``identifier`` whether a row belongs to the test set.

    The identifier is converted to a 64-bit integer and hashed with CRC32;
    the row is in the test set when that checksum is below ``test_ratio * 2**32``.
    The same identifier therefore always gets the same answer.
    """
    checksum = crc32(np.int64(identifier))
    threshold = test_ratio * 2**32
    return checksum < threshold

def split_data_with_id_hash(data, test_ratio, id_column):
    """Split ``data`` into ``(train, test)`` using a per-row identifier hash.

    Each value in ``data[id_column]`` is passed to ``is_id_in_test_set``; rows
    for which it returns true form the test set and the remaining rows form
    the training set.
    """
    test_mask = data[id_column].apply(
        lambda row_id: is_id_in_test_set(row_id, test_ratio)
    )
    train_rows = data.loc[~test_mask]
    test_rows = data.loc[test_mask]
    return train_rows, test_rows



In [None]:
# reset_index() adds an "index" column holding the previous row labels; it is
# used below as the per-row identifier for the hash-based split.
housing_with_id = housing.reset_index()
train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, "index")

# Report how many rows ended up in each subset.
print("Training length: ",len(train_set))
print("Test length: ",len(test_set))

In [None]:
# Bucket median_income into five categories labelled 1-5, using bin edges
# 0, 1.5, 3.0, 4.5, 6 and infinity. This adds an 'income_cat' column to `housing`.
housing['income_cat'] = pd.cut(housing['median_income'],bins=[0.,1.5,3.0,4.5,6.,np.inf],labels=[1,2,3,4,5])
# Bar chart of the number of rows per category, ordered by category label.
housing['income_cat'].value_counts().sort_index().plot.bar(rot=0,grid=True)
plt.xlabel("Income Category")
plt.ylabel("Number of Districts")
plt.show()

In [None]:
# from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

# splitter = StratifiedShuffleSplit(n_splits=10,test_size=0.2,random_state=42)
# strat_splits = []
# for train_index, test_index in splitter.split(housing,housing['income_cat']):
#     strat_train_set = housing.loc[train_index]
#     strat_test_set = housing.loc[test_index]
#     strat_splits.append((strat_train_set,strat_test_set))

In [None]:
# 80/20 split stratified on income_cat; random_state=42 fixes the shuffle.
strat_train_set, strat_test_set = train_test_split(housing, test_size=0.2, stratify = housing['income_cat'],random_state=42)

In [None]:
# Share of each income category within the stratified test set.
strat_test_set['income_cat'].value_counts()/len(strat_test_set)

In [None]:
# income_cat was only needed for stratification, so remove it from both subsets.
# errors='ignore' keeps the cell re-runnable: without it a second run raises
# KeyError because the column has already been dropped.
for set_ in (strat_test_set,strat_train_set):
    set_.drop('income_cat',axis=1,inplace=True,errors='ignore')

# Create Data Visualization

In [None]:
# Work on a copy of the stratified training set from here on. Note that this
# rebinds `housing`, which until now referred to the full dataset.
housing = strat_train_set.copy()