# In [2]:  (Jupyter notebook input cell)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Set the style for plots
sns.set(style="whitegrid")

# Step 1: Load the Dataset
# Load the bike rentals dataset.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the cell on another machine.
data_path = 'C:/Project/PhythonDataScience/dtsc_modul_4_homework/bikes_rent.csv'
df = pd.read_csv(data_path)

# Step 2: Initial Data Inspection
# Checking the first few rows of the dataset
print("First few rows of the dataset:")
print(df.head())

# Checking the basic information of the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("\nDataset Info:")
df.info()

# Checking for missing values (count of nulls per column)
print("\nMissing Values Count:")
print(df.isnull().sum())

# Step 3: Exploratory Data Analysis (EDA)
# Summary statistics of numerical columns
print("\nSummary Statistics:")
print(df.describe())

# Plotting a time series of bike rentals.
# NOTE(review): assumes the CSV has 'date' and 'rentals' columns - confirm
# against the file header.
# The CSV is read without date parsing, so 'date' is converted to datetimes
# for the x axis here (without modifying df); plotting the raw strings would
# not give a proper, chronologically scaled time axis.
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x=pd.to_datetime(df['date']), y='rentals')
plt.title('Bike Rentals Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Rentals')
plt.xticks(rotation=45)
plt.show()

# Plotting histograms for numerical features
df.hist(figsize=(12, 8))
plt.show()

# Step 4: Feature Engineering
# Derive calendar features from the 'date' column. The column is parsed once
# and the result reused, instead of re-parsing it for every derived feature.
_parsed_dates = pd.to_datetime(df['date'])
df['hour'] = _parsed_dates.dt.hour                # hour of day as an integer
df['day_of_week'] = _parsed_dates.dt.day_name()   # weekday name, e.g. 'Monday'

# Step 5: Correlation Analysis
# Only numeric columns can be correlated. df also holds string columns at this
# point (e.g. 'day_of_week'), and pandas >= 2.0 raises on them unless
# numeric_only=True is passed explicitly.
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Step 6: Data Preprocessing
# Handling missing values (if any) by carrying the previous row's value forward.
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated in recent pandas releases.
# Note: forward fill cannot fill missing values in the very first row(s).
df.ffill(inplace=True)

# Encoding categorical features: one-hot encode the weekday name. drop_first
# omits one weekday column, which then acts as the baseline category.
df = pd.get_dummies(df, columns=['day_of_week'], drop_first=True)

# Step 7: Modeling
# Defining features (X) and target (y).
# The weekday dummy columns are collected from the frame itself rather than
# listed by hand: which 'day_of_week_*' columns exist depends on the weekdays
# present in the data and on the category dropped by drop_first=True.
feature_columns = ['hour'] + [
    col for col in df.columns if str(col).startswith('day_of_week_')
]
X = df[feature_columns]
y = df['rentals']

# Splitting the data into training and testing sets (70% / 30%); the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit a linear regression model on the training split
# (fit() returns the fitted estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Step 8: Model Evaluation
# Predict rentals for the held-out test split
y_pred = model.predict(X_test)

# Report the error and the goodness of fit on the test split
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared Score: {r2}')

# Step 9: Analysis of Results
# Scatter of actual vs. predicted values for the test split
plt.scatter(y_test, y_pred)
plt.title('Actual vs Predicted Rentals')
plt.xlabel('Actual Rentals')
plt.ylabel('Predicted Rentals')
plt.show()

# Step 10: Conclusion
# Summarize findings and conclusions based on the analysis and modeling.


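# Cell output: ModuleNotFoundError: No module named 'sklearn'
# (The 'sklearn' module is provided by the scikit-learn package; install it in
# the notebook's environment, e.g. `pip install scikit-learn`, and re-run.)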