In [None]:
import pandas as pd

In [None]:
import os

# Location of the raw Seoul bike-sharing CSV. Set the SEOUL_BIKE_DATA environment
# variable to run the notebook on another machine; when it is unset the original
# hard-coded location is used, so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    "SEOUL_BIKE_DATA",
    r"C:\Users\Sachini\Downloads\seoul+bike+sharing+demand\SeoulBikeData.csv",
)

# The file is decoded as ISO-8859-1; column names used later in this notebook
# contain non-ASCII characters such as the degree sign in 'Temperature(°C)'.
df = pd.read_csv(DATA_PATH, sep=',', encoding='ISO-8859-1')

In [None]:
# Display basic info
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be passed to print() (that printed the report first and then
# "Dataset Overview:" followed by "None"). Print the header, then call info().
print("Dataset Overview:")
df.info()

In [None]:
# Show the freshly loaded DataFrame as this cell's output
df

In [None]:
# Descriptive statistics as computed by DataFrame.describe(), printed under a header
summary_statistics = df.describe()
print("\nSummary Statistics:\n", summary_statistics)

In [None]:
# Parse the 'Date' strings (day/month/year) into datetimes, overwriting the column
# in place; a later cell relies on the .dt accessor of this column.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

In [None]:
# Show the DataFrame again now that 'Date' has been converted
df

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image, display

# Columns whose dtype is int64 or float64 are treated as numerical
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# describe() statistics for those columns, rounded to two decimals
numerical_summary = df.loc[:, numerical_cols].describe().round(2)

# Render a summary DataFrame as a matplotlib table and write it to an image file
def save_numerical_summary_as_image(summary_df, filename, title):
    """Draw ``summary_df`` as a table figure and save it to ``filename``.

    Parameters
    ----------
    summary_df : DataFrame
        Its values fill the cells; its columns and index become the
        column and row headers of the table.
    filename : str
        Destination handed to ``savefig``.
    title : str
        Text shown above the table.

    Returns
    -------
    None
        The figure is closed once it has been written.
    """
    n_rows = len(summary_df)
    n_cols = len(summary_df.columns)

    # Fixed width of 30; height grows by 0.7 per row and is capped at 10
    fig_height = min(10, n_rows * 0.7)
    fig, ax = plt.subplots(figsize=(30, fig_height))

    # Only the table should be visible, so hide the axes entirely
    ax.axis('tight')
    ax.axis('off')

    table = ax.table(
        cellText=summary_df.values,
        colLabels=summary_df.columns,
        rowLabels=summary_df.index,
        loc='center',
        cellLoc='center',
        colLoc='center',
        colColours=["#f4f4f4"] * n_cols,  # light gray column-header cells
        rowColours=["#e6e6e6"] * n_rows,  # one uniform gray for every row-header cell
    )

    # Use a fixed 10pt font instead of matplotlib's automatic sizing
    table.auto_set_font_size(False)
    table.set_fontsize(10)

    ax.set_title(title, fontsize=12, fontweight="bold", pad=10)

    # Write the PNG, then release the figure
    fig.savefig(filename, bbox_inches='tight', dpi=300)
    plt.close(fig)

# Save and display numerical summary
# The table is first written to 'numerical_summary.png' (relative to the working
# directory); the saved file is then embedded in the cell output.
save_numerical_summary_as_image(numerical_summary, 'numerical_summary.png', "Numerical Summary Statistics")
display(Image('numerical_summary.png'))

In [None]:
# Columns stored with the object dtype are treated as categorical
categorical_cols = df.select_dtypes(include=['object']).columns

# describe() for those columns, transposed so that each column becomes one row
categorical_summary = df.loc[:, categorical_cols].describe().transpose()

# Render a summary DataFrame as a matplotlib table and write it to an image file
def save_categorical_summary_as_image(summary_df, filename, title):
    """Draw ``summary_df`` as a table figure and save it to ``filename``.

    Parameters
    ----------
    summary_df : DataFrame
        Its values fill the cells; its columns and index become the
        column and row headers of the table.
    filename : str
        Destination handed to ``savefig``.
    title : str
        Text shown above the table.

    Returns
    -------
    None
        The figure is closed once it has been written.
    """
    row_count = len(summary_df)
    header_colours = ["#f4f4f4"] * len(summary_df.columns)  # light gray column headers
    row_label_colours = ["#e6e6e6"] * row_count  # one uniform gray for all row headers

    # Fixed width of 10; height grows by 0.7 per row and is capped at 10
    fig, ax = plt.subplots(figsize=(10, min(10, row_count * 0.7)))

    # Only the table should be visible, so hide the axes entirely
    ax.axis('tight')
    ax.axis('off')

    table = ax.table(
        cellText=summary_df.values,
        colLabels=summary_df.columns,
        rowLabels=summary_df.index,
        loc='center',
        cellLoc='center',
        colLoc='center',
        colColours=header_colours,
        rowColours=row_label_colours,
    )

    # Use a fixed 10pt font instead of matplotlib's automatic sizing
    table.auto_set_font_size(False)
    table.set_fontsize(10)

    ax.set_title(title, fontsize=12, fontweight="bold", pad=10)

    # Write the PNG, then release the figure
    fig.savefig(filename, bbox_inches='tight', dpi=300)
    plt.close(fig)

# Save and display categorical summary
# The table is first written to 'categorical_summary.png' (relative to the working
# directory); the saved file is then embedded in the cell output.
save_categorical_summary_as_image(categorical_summary, 'categorical_summary.png', "Categorical Summary Statistics")
display(Image('categorical_summary.png'))


In [None]:
import seaborn as sns

# Quantitative columns that each get their own box plot
quant_vars = [
    'Rented Bike Count',
    'Temperature(°C)',
    'Humidity(%)',
    'Wind speed (m/s)',
    'Visibility (10m)',
    'Dew point temperature(°C)',
    'Solar Radiation (MJ/m2)',
]

# One box plot per variable, placed on a 3x3 grid (slots beyond the list stay empty)
plt.figure(figsize=(12, 8))
for i, var in enumerate(quant_vars, 1):
    ax = plt.subplot(3, 3, i)
    sns.boxplot(y=df[var], ax=ax)
    ax.set_title(var)

plt.tight_layout()

# Write the grid to disk at 300 dpi before showing it
plt.savefig("box_plots.png", dpi=300)
plt.show()

In [None]:
# Identify categorical (qualitative) variables
categorical_vars = df.select_dtypes(include=['object']).columns

# Set the style
sns.set_style("whitegrid")

# Create subplots in one row.
# squeeze=False makes plt.subplots always return a 2-D array of Axes; without it
# a single categorical column yields a bare Axes object and `axes[i]` below fails.
# Flattening gives the 1-D array the loop indexes into.
fig, axes = plt.subplots(nrows=1, ncols=len(categorical_vars),
                         figsize=(5 * len(categorical_vars), 5), squeeze=False)
axes = axes.flatten()

# Plot count plots for each categorical variable
for i, col in enumerate(categorical_vars):
    sns.countplot(x=df[col], ax=axes[i], palette="Set2")
    axes[i].set_title(f"Distribution of {col}")
    axes[i].set_xlabel(col)
    axes[i].set_ylabel("Count")
    axes[i].tick_params(axis='x', rotation=45)  # Rotate labels if needed

# Adjust layout
plt.tight_layout()

# Save the figure
plt.savefig("categorical_univariate_analysis.png")

# Show the plots
plt.show()

In [None]:
# Mean 'Rented Bike Count' for every value of 'Hour'
hourly_bike_count = df.groupby('Hour')['Rented Bike Count'].mean()

# Line plot of the hourly means with a marker on every point
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(hourly_bike_count.index, hourly_bike_count.values, marker='o')
ax.set_title('Hourly Rented Bike Count', fontsize=16)
ax.set_xlabel('Hour of the Day', fontsize=12)
ax.set_ylabel('Average Rented Bike Count', fontsize=12)
ax.grid(True)
ax.set_xticks(range(0, 24))  # one tick for each hour 0..23

# Write the figure to disk before showing it
plt.savefig("hourly_rented_bike_count.png")

plt.show()

In [None]:
# Extract month from the 'Date' column
df['Month'] = df['Date'].dt.month

# Map month numbers to month names
month_names = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June',
    7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'
}
df['Month_Name'] = df['Month'].map(month_names)

# Group the data by 'Month_Name' and calculate the mean 'Rented Bike Count'
monthly_bike_count = df.groupby('Month_Name')['Rented Bike Count'].mean()

# groupby returns the names in sorted (alphabetical) order; put them back in
# calendar order. reindex is used instead of label selection so that a month
# missing from the data becomes a NaN gap in the plot rather than a KeyError.
monthly_bike_count = monthly_bike_count.reindex(list(month_names.values()))

# Plot the data
plt.figure(figsize=(10, 6))
monthly_bike_count.plot(kind='line', marker='o', linestyle='-', color='b')
plt.title('Average Rented Bike Count by Month', fontsize=16)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Average Rented Bike Count', fontsize=12)
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()

# Save the figure
plt.savefig("average_rented_bike_count_by_month.png")

plt.show()

In [None]:
# Set up the plot
plt.figure(figsize=(10, 6))

# Calculate the mean 'Rented Bike Count' for each season
season_avg_bike_count = df.groupby('Seasons')['Rented Bike Count'].mean()

# groupby returns the seasons in sorted (alphabetical) order, but a line chart
# connects neighbouring points, so arrange them in calendar order instead.
# NOTE(review): assumes the labels are 'Spring', 'Summer', 'Autumn', 'Winter' —
# confirm against the data. Any other label is kept and appended after these.
calendar_seasons = ['Spring', 'Summer', 'Autumn', 'Winter']
season_order = [s for s in calendar_seasons if s in season_avg_bike_count.index]
season_order += [s for s in season_avg_bike_count.index if s not in calendar_seasons]
season_avg_bike_count = season_avg_bike_count.reindex(season_order)

# Plotting the line chart for 'Seasons' vs 'Rented Bike Count'
season_avg_bike_count.plot(kind='line', marker='o', color='b', linewidth=2)

# Adding title and labels
plt.title('Line Chart: Seasons vs Rented Bike Count', fontsize=16)
plt.xlabel('Seasons', fontsize=12)
plt.ylabel('Rented Bike Count', fontsize=12)

# Set the x-axis ticks to be the season names
plt.xticks(range(len(season_avg_bike_count)), season_avg_bike_count.index, rotation=45)

# Adjust layout to avoid margin issues
plt.tight_layout()

# Save the plot to a file
plt.savefig('line_chart_seasons_vs_bike_count.png', dpi=300)

# Show the plot
plt.show()

In [None]:
# Bucket an hour of the day into 'Morning', 'Afternoon' or 'Evening'
def categorize_hour(hour):
    """Return the time-of-day label for ``hour``.

    6 <= hour < 12 gives 'Morning' and 12 <= hour < 18 gives 'Afternoon'.
    Every other value gives 'Evening', which therefore also covers the
    hours before 6 as well as 18 and later.
    """
    # (label, inclusive start, exclusive end) for the two daytime windows
    daytime_windows = (
        ('Morning', 6, 12),
        ('Afternoon', 12, 18),
    )
    for label, start, stop in daytime_windows:
        if start <= hour < stop:
            return label
    return 'Evening'

# Label every row's 'Hour' with categorize_hour and store it as 'Time of Day'
df['Time of Day'] = df['Hour'].map(categorize_hour)

In [None]:
# Show the DataFrame including the new 'Time of Day' column
df

In [None]:
# One scatter panel per 'Time of Day' value; the same column also drives the colour
g = sns.FacetGrid(
    df,
    col="Time of Day",
    hue="Time of Day",
    palette="Set2",
    height=5,
    aspect=1.2,
)
g.map(sns.scatterplot, 'Rainfall(mm)', 'Rented Bike Count')

# Shared axis labels and a title on each panel
g.set_axis_labels('Rainfall (mm)', 'Rented Bike Count')
g.set_titles("{col_name} Time of Day")

# Legend for the hue levels
g.add_legend()

# Figure-level title above all panels
rainfall_title = 'Impact of Rainfall on Bike Rentals Across Different Times of Day'
g.fig.suptitle(rainfall_title, fontsize=16)

# Tighten the layout, then reserve the top 10% of the figure for the suptitle
plt.tight_layout()
plt.subplots_adjust(top=0.9)

# Write the figure to disk at 300 dpi
plt.savefig('impact_of_rainfall.png', dpi=300)

plt.show()

In [None]:
# One scatter panel per 'Time of Day' value; the same column also drives the colour
g_snowfall = sns.FacetGrid(
    df,
    col="Time of Day",
    hue="Time of Day",
    palette="Set2",
    height=5,
    aspect=1.2,
)
g_snowfall.map(sns.scatterplot, 'Snowfall (cm)', 'Rented Bike Count')

# Shared axis labels and a title on each panel
g_snowfall.set_axis_labels('Snowfall (cm)', 'Rented Bike Count')
g_snowfall.set_titles("{col_name} Time of Day")

# Legend for the hue levels
g_snowfall.add_legend()

# Figure-level title above all panels
snowfall_title = 'Impact of Snowfall on Bike Rentals Across Different Times of Day'
g_snowfall.fig.suptitle(snowfall_title, fontsize=16)

# Tighten the layout, then reserve the top 10% of the figure for the suptitle
plt.tight_layout()
plt.subplots_adjust(top=0.9)

# Write the figure to disk at 300 dpi
plt.savefig('impact_of_snowfall.png', dpi=300)

plt.show()

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be passed to print() (that printed the report first and then
# "Dataset Overview:" followed by "None"). Print the header, then call info().
print("Dataset Overview:")
df.info()

In [None]:
# Show the DataFrame as it stands before encoding
df

In [None]:
from sklearn.preprocessing import StandardScaler

df_encoded = df.copy()  # Work on a copy so the original dataframe stays untouched

# Encode the two binary columns as 0/1. Each column has its own mapping so a
# label is never translated through another column's vocabulary.
binary_mappings = {
    'Holiday': {'No Holiday': 0, 'Holiday': 1},
    'Functioning Day': {'No': 0, 'Yes': 1},
}
binary_cols = list(binary_mappings)
for col in binary_cols:
    encoded = df_encoded[col].map(binary_mappings[col])
    # Series.map turns every label that is missing from the mapping into NaN.
    # Fail loudly on such labels instead of letting NaNs slip into later steps;
    # values that were already missing in the input are left alone.
    unmapped = df_encoded.loc[encoded.isna() & df_encoded[col].notna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected values in {col!r}: {list(unmapped)}")
    df_encoded[col] = encoded

# One-hot encode the nominal columns; drop_first=True omits the first level of
# each so the dummy columns are not perfectly collinear.
df_encoded = pd.get_dummies(df_encoded, columns=['Seasons', 'Time of Day'], drop_first=True)

In [None]:
# Show the encoded copy of the DataFrame
df_encoded

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be passed to print() (that printed the report first and then
# "Dataset Overview:" followed by "None"). Print the header, then call info().
print("Dataset Overview:")
df_encoded.info()

In [None]:
# Columns that are left unscaled: Month, Hour and the two 0/1-encoded columns
exclude_cols = ['Month', 'Hour', 'Holiday', 'Functioning Day']  # Add other categorical columns if needed

# Every remaining int64/float64 column gets standardized.
# NOTE(review): 'Rented Bike Count' is not in exclude_cols, so if it is numeric
# it is scaled here as well — confirm that this is intended.
numeric_cols = (
    df_encoded.select_dtypes(include=['int64', 'float64'])
    .columns
    .difference(exclude_cols)
)

# Fit the scaler on the selected columns and write the scaled values back
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_encoded[numeric_cols])
df_encoded[numeric_cols] = scaled_values

In [None]:
# Show the original DataFrame (the scaling above was applied to df_encoded)
df

In [None]:
# Show the encoded DataFrame after scaling
df_encoded

In [None]:
# Save the preprocessed dataset to a CSV file
# index=False keeps the DataFrame's row index out of the file; the relative
# filename means the file lands in the current working directory.
df_encoded.to_csv("seoul_bike_sharing_processed.csv", index=False)

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be passed to print() (that printed the report first and then
# "Dataset Overview:" followed by "None"). Print the header, then call info().
print("Dataset Overview:")
df_encoded.info()

In [None]:
# Month, Hour, Holiday and Functioning Day are left out of the correlation analysis
exclude_cols = ['Month', 'Hour', 'Holiday', 'Functioning Day']
numeric_cols = (
    df_encoded.select_dtypes(include=['int64', 'float64'])
    .columns
    .difference(exclude_cols)
)

# Pairwise correlations between the remaining int64/float64 columns
df_corr = df_encoded[numeric_cols]
corr_matrix = df_corr.corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,      # write each coefficient into its cell
    cmap='coolwarm',
    fmt='.2f',       # two decimals per annotation
    linewidths=0.5,
)

# Slant the x tick labels so long column names stay readable
plt.xticks(rotation=45, ha='right')

# Keep labels from being cut off at the figure edge
plt.tight_layout()

# Write the figure to disk at 300 dpi
plt.savefig('heatmap.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Predictor columns (weather, hour, holiday / functioning-day flags) and the target
predictor_cols = [
    'Temperature(°C)',
    'Humidity(%)',
    'Wind speed (m/s)',
    'Visibility (10m)',
    'Dew point temperature(°C)',
    'Solar Radiation (MJ/m2)',
    'Rainfall(mm)',
    'Snowfall (cm)',
    'Hour',
    'Holiday',
    'Functioning Day',
]
X = df_encoded[predictor_cols]
y = df_encoded['Rented Bike Count']

# Fit one linear model on all predictors and keep its in-sample predictions
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

# Grid layout: three panels per row, with as many rows as the features require
features = X.columns
num_features = len(features)
ncols = 3
nrows = -(-num_features // ncols)  # ceiling division

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 5 * nrows))

# Work with a 1-D sequence of Axes
axes = axes.ravel()

# One panel per feature. Each panel is drawn by sns.regplot from df_encoded for
# that single feature; the `model` fitted above is not used for these lines.
for i, feature in enumerate(features):
    panel = axes[i]
    sns.regplot(data=df_encoded, x=feature, y='Rented Bike Count', ax=panel,
                line_kws={'color': 'red'})
    panel.set_title(f'Linear Regression: {feature} vs Bike Rentals')

# Drop the grid slots that did not receive a feature
for unused_panel in axes[num_features:]:
    fig.delaxes(unused_panel)

# Even out the spacing between panels
plt.tight_layout()

# Write the figure to disk at 300 dpi
plt.savefig('linear_regression_all_features.png', dpi=300)

plt.show()