In [None]:
# Import necessary libraries
import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
import matplotlib.pyplot as plt  # For data visualization
import seaborn as sns  # For advanced visualizations
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.linear_model import LinearRegression  # For applying linear regression model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error  # For model evaluation metrics
import re  # For regular expressions

In [None]:
# Read the Boston housing prices CSV (path is relative to the working directory).
# The file holds a set of housing-related attributes, described in the next cell.
boston = pd.read_csv(filepath_or_buffer='boston_house_prices.csv')


In [None]:
# read_csv already returned a DataFrame, so wrapping it in pd.DataFrame() converts
# nothing; the constructor also does not copy DataFrame input by default, so the
# result can share data with `boston`. Take an explicit copy instead so later
# edits to `df` never touch the raw frame.
df = boston.copy()

# Display the first five rows of the dataset
df.head()


In [None]:
# Column Information and Key Insights
# CRIM: Per capita crime rate by town (higher values indicate higher crime rate).
# ZN: Proportion of residential land zoned for lots over 25,000 sq. ft. (higher values indicate more land reserved for large lots).
# INDUS: Proportion of non-retail business acres per town (higher values indicate more industrial areas).
# CHAS: Charles River dummy variable (1 if near river, 0 otherwise).
# NOX: Nitric oxides concentration, in parts per 10 million (higher values indicate more pollution).
# RM: Average number of rooms per dwelling (higher values indicate larger homes).
# AGE: Proportion of owner-occupied units built before 1940 (higher values indicate older houses).
# DIS: Weighted distance to five major employment centers (higher values indicate more distance from job centers).
# RAD: Index of accessibility to radial highways (higher values indicate better accessibility).
# TAX: Property tax rate per $10,000 (higher values indicate higher taxes).
# PTRATIO: Pupil-teacher ratio by town (lower values indicate better education quality).
# B: Transformed index 1000(Bk - 0.63)^2, where Bk is the proportion of Black residents by town (not the raw proportion itself).
# LSTAT: Percentage of lower-status population (higher values indicate lower-income neighborhoods).
# MEDV: Median value of owner-occupied homes in $1000s (target variable).


In [None]:
# Rename the target column MEDV -> PRICE for easier access.
# rename() returns a new frame and ignores labels that are absent, so this cell
# can be re-run safely; the previous add-then-drop(inplace=True) pair raised
# KeyError on a second run because MEDV was already gone. The column keeps its
# original position in the frame.
df = df.rename(columns={'MEDV': 'PRICE'})

# rename() is silent when MEDV is missing, so fail loudly if no target exists.
assert 'PRICE' in df.columns, "expected a MEDV (or already renamed PRICE) column"

In [None]:
# Preview the first five rows to confirm the frame's current columns
df.head(n=5)


In [None]:
# Dimensions of the frame as a (number of rows, number of columns) tuple;
# being the last expression, it is shown as the cell's output.
df.shape

In [None]:
# List the column labels currently present in the frame
df.columns

In [None]:
# Print a concise summary of the frame: per-column dtype and non-null count,
# plus memory usage. info() prints directly rather than returning a value.
df.info()

In [None]:
# Count the distinct values in each column; very low counts point at
# categorical-like columns (e.g. the CHAS 0/1 dummy described above).
df.nunique()

In [None]:
# Count the missing (NaN) entries in every column
df.isna().sum()

In [None]:
# Summary statistics per numeric column (count, mean, std, min, quartiles, max)
df.describe()

In [None]:
# Pairwise correlation between all columns (pandas' default method is Pearson).
# The PRICE row/column shows how strongly each feature moves with the target.
df.corr()

In [None]:
# Draw the correlation matrix as an annotated heatmap on an explicit 10x10 axes
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)

In [None]:
# Scatter-plot matrix of every column against every other (histograms on the
# diagonal by default); height=2 sets each panel's size in inches.
# NOTE(review): the grid grows quadratically with the column count, so this
# cell is likely the slowest in the notebook - confirm it is acceptable.
sns.pairplot(df, height=2)

In [None]:
# One box per column on a wide 20x10 axes, to spot outliers at a glance
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=df, ax=ax)

In [None]:
# Report the lowest and highest value of the target column
print("Minimum Price:", df['PRICE'].min())
print("Maximum Price:", df['PRICE'].max())

In [None]:
# Report the spread (sample standard deviation) of the target column
print("Standard Deviation of Price:", df['PRICE'].std())

In [None]:
# Write the current frame to disk, leaving the row index out of the file
df.to_csv(path_or_buf='boston_dataset.csv', index=False)


In [None]:
# Machine Learning - Linear Regression
# Separate the target (y = PRICE) from the predictors (X = every other column)
y = df['PRICE']
X = df.drop(columns='PRICE')

In [None]:
# Turn both into NumPy arrays; the target is reshaped into a single-column 2-D array
X, y = np.array(X), np.array(y).reshape((-1, 1))

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report how many samples landed in each partition
for label, part in (
    ("Training Data Size:", X_train),
    ("Testing Data Size:", X_test),
    ("Training Labels Size:", y_train),
    ("Testing Labels Size:", y_test),
):
    print(label, len(part))

In [None]:
# Create the linear regression estimator and fit it on the training partition
# (fit() returns the estimator itself, so construction and fitting can be chained)
model = LinearRegression().fit(X_train, y_train)

# Show the learned parameters
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

In [None]:
# Predict target values for the held-out test features; y_pred is compared
# against y_test in the evaluation and plotting cells that follow.
y_pred = model.predict(X_test)


In [None]:
# Evaluate model performance
print("R-squared Value:", r2_score(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))

In [None]:
# Plot actual vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')

# Reference line y = x: a perfect model would put every point on it.
# Both coordinates must use the same endpoints. Pairing y_test's range on x with
# y_pred's range on y (as before) draws a line between the data's corners, which
# is not the identity line and misrepresents how far predictions are off.
lo = min(y_test.min(), y_pred.min())
hi = max(y_test.max(), y_pred.max())
plt.plot([lo, hi], [lo, hi], color='red')

plt.title('Actual vs Predicted Prices')
plt.grid(True)
plt.show()

In [None]:
!pip install klib

In [None]:
import klib
from sklearn.datasets import load_diabetes

In [None]:
# Load the Titanic sample dataset for the klib demo.
# A github.com/<repo>/blob/... URL serves the HTML file viewer rather than the
# file itself, so read_csv cannot parse it as the dataset; fetch the CSV from
# the raw-content host instead.
# NOTE: this rebinds `df`, replacing the Boston housing frame used earlier.
df = pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')

In [None]:
# Preview the first five rows of the frame currently bound to `df`
df.head()

In [None]:
# klib.describe - functions for visualizing datasets
# The calls were written as markdown bullets ("- klib....") inside a code cell;
# Python parses the leading "-" as unary minus applied to each call's return
# value, which fails at runtime. The bullets are removed so each line is a
# plain call.
# NOTE(review): corr_mat appears to return an object rather than draw a figure,
# so it is only rendered when it is a cell's last expression - move it to its
# own cell if the table is wanted.
klib.cat_plot(df)  # returns a visualization of the number and frequency of categorical features
klib.corr_mat(df)  # returns a color-encoded correlation matrix
klib.corr_plot(df)  # returns a color-encoded heatmap, ideal for correlations
klib.corr_interactive_plot(df, split="neg").show()  # returns an interactive correlation plot using plotly
klib.dist_plot(df)  # returns a distribution plot for every numeric feature
klib.missingval_plot(df)  # returns a figure containing information about missing values
