<a href="https://colab.research.google.com/github/SURENDRAN-17/CMP7005-final/blob/main/CMP7005_surendran.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd
import os

# Directory containing the dataset files. The PRSA_DATA_DIR environment
# variable overrides the default so the notebook can run on machines where
# the archive was extracted somewhere else.
data_dir = os.environ.get(
    "PRSA_DATA_DIR",
    "/mnt/data/PRSA_Data_extracted/PRSA_Data_20130301-20170228/",
)

# Fail early with an actionable message instead of a bare os.listdir error
if not os.path.isdir(data_dir):
    raise FileNotFoundError(
        f"Dataset directory not found: {data_dir!r}. "
        "Extract the dataset there or set the PRSA_DATA_DIR environment variable."
    )

# Load all the CSV files into a dictionary of dataframes.
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the combined frame identical across runs.
data_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))
if not data_files:
    raise FileNotFoundError(f"No .csv files found in {data_dir!r}")

dataframes = {}
for file in data_files:
    file_path = os.path.join(data_dir, file)
    # The third underscore-separated token of the file name is used as the
    # site key; fall back to the bare file name when there is no such token
    # rather than crashing with an IndexError.
    name_parts = file.split('_')
    key = name_parts[2] if len(name_parts) > 2 else os.path.splitext(file)[0]
    dataframes[key] = pd.read_csv(file_path)

# Combine all dataframes into one for analysis; the dictionary keys become
# the outer "Site" index level, which is then moved into a regular column.
combined_df = pd.concat(dataframes.values(), keys=dataframes.keys(), names=["Site", "Index"])
combined_df = combined_df.reset_index(level=0)
combined_df.head()


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/PRSA_Data_extracted/PRSA_Data_20130301-20170228/'

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# General information about the dataset.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
combined_df.info()

# Count missing values per column before any imputation is applied
missing_values = combined_df.isnull().sum()

# Visualization: heatmap of missing cells. This is drawn BEFORE the fill
# below; after imputation the numeric gaps no longer exist to be shown.
fig, ax = plt.subplots()
sns.heatmap(combined_df.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()

# Handling missing values (example: filling numeric columns with their mean).
# numeric_only=True is required: the frame contains non-numeric columns (at
# least the "Site" column), and pandas 2.x raises a TypeError when asked for
# the mean of those. Non-numeric columns are left unfilled.
combined_df = combined_df.fillna(combined_df.mean(numeric_only=True))

# Summary statistics of the imputed data
summary_stats = combined_df.describe()


In [None]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Select features and target for a simple regression model
features = ['PM2.5', 'PM10', 'NO2', 'CO', 'O3', 'SO2']
target = 'TEMP'

# Drop rows with a missing target OR a missing feature value, so this cell
# does not silently depend on an earlier cell having imputed the features.
model_data = combined_df.dropna(subset=features + [target])
X = model_data[features]
y = model_data[target]

# Train-test split (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training; n_jobs=-1 builds the trees on all available cores
model = RandomForestRegressor(random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Model evaluation on the held-out 20%
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


In [None]:

import tkinter as tk
from tkinter import ttk

def show_data_overview():
    """Open a child window of ``root`` showing ``combined_df.describe()`` as text.

    Reads the module-level ``root`` and ``combined_df``; returns nothing.
    """
    popup = tk.Toplevel(root)
    popup.title("Data Overview")

    # Text widget that wraps on word boundaries and fills the whole window
    summary_box = tk.Text(popup, wrap='word')
    summary_text = str(combined_df.describe())
    summary_box.insert('1.0', summary_text)
    summary_box.pack(fill='both', expand=True)

# GUI Application
# tk.Tk() raises tk.TclError when no display is available (for example on a
# hosted or headless kernel). In that case the GUI is skipped with a message
# instead of aborting the rest of the notebook run.
try:
    root = tk.Tk()
except tk.TclError as exc:
    root = None
    print(f"GUI not started (no display available): {exc}")
else:
    root.title("Air Quality Analysis")

    main_frame = ttk.Frame(root)
    main_frame.pack(fill='both', expand=True)

    # Button that opens the summary-statistics window
    overview_button = ttk.Button(main_frame, text="Data Overview", command=show_data_overview)
    overview_button.pack(pady=10)

    # Blocks this cell until the window is closed
    root.mainloop()


In [None]:

# Remember to commit your work to GitHub.
# Example Git commands:
# git add .
# git commit -m "Initial commit: Data analysis tasks 1 to 4 completed"
# git push origin main
