<a href="https://colab.research.google.com/github/SURENDRAN-17/CMP7005-final/blob/main/CMP7005_surendran.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd
import os

# Directory containing the dataset files
data_dir = "/mnt/data/PRSA_Data_extracted/PRSA_Data_20130301-20170228/"

# Load all the CSV files into a dictionary of dataframes
data_files = [f for f in os.listdir(data_dir) if f.endswith('.csv')]
dataframes = {}

for file in data_files:
    file_path = os.path.join(data_dir, file)
    key = file.split('_')[2]  # Extract site name as key
    dataframes[key] = pd.read_csv(file_path)

# Combine all dataframes into one for analysis
combined_df = pd.concat(dataframes.values(), keys=dataframes.keys(), names=["Site", "Index"])
combined_df.reset_index(level=0, inplace=True)
combined_df.head()


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/PRSA_Data_extracted/PRSA_Data_20130301-20170228/'

In [None]:

# General Information about the dataset
print(combined_df.info())

# Checking for missing values
missing_values = combined_df.isnull().sum()

# Handling missing values (example: filling with mean)
combined_df.fillna(combined_df.mean(), inplace=True)

# Summary statistics
summary_stats = combined_df.describe()

# Visualization: Pairplot for relationships
import matplotlib.pyplot as plt
import seaborn as sns

sns.heatmap(combined_df.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.show()


In [None]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Select features and target for a simple regression model
features = ['PM2.5', 'PM10', 'NO2', 'CO', 'O3', 'SO2']
target = 'TEMP'

# Drop rows with missing target values
model_data = combined_df.dropna(subset=[target])
X = model_data[features]
y = model_data[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Model evaluation
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


In [None]:

import tkinter as tk
from tkinter import ttk

def show_data_overview():
    overview_window = tk.Toplevel(root)
    overview_window.title("Data Overview")
    text = tk.Text(overview_window, wrap='word')
    text.insert('1.0', str(combined_df.describe()))
    text.pack(expand=True, fill='both')

# GUI Application
root = tk.Tk()
root.title("Air Quality Analysis")

main_frame = ttk.Frame(root)
main_frame.pack(fill='both', expand=True)

overview_button = ttk.Button(main_frame, text="Data Overview", command=show_data_overview)
overview_button.pack(pady=10)

root.mainloop()


In [None]:

# Ensure to commit your work using GitHub.
# Example Git commands:
# git add .
# git commit -m "Initial commit: Data analysis tasks 1 to 4 completed"
# git push origin main


In [2]:
import pandas as pd

# File paths
files = [
    'PRSA_Data_Dongsi_20130301-20170228.csv',
    'PRSA_Data_Guanyuan_20130301-20170228.csv',
    'PRSA_Data_Gucheng_20130301-20170228.csv',
    'PRSA_Data_Huairou_20130301-20170228.csv',
    'PRSA_Data_Nongzhanguan_20130301-20170228.csv',
    'PRSA_Data_Shunyi_20130301-20170228.csv'
]

# Load and merge datasets
dataframes = [pd.read_csv(file) for file in files]
combined_data = pd.concat(dataframes, ignore_index=True)

# Display the first few rows and information about the combined dataset
print(combined_data.head())
print(combined_data.info())


   No  year  month  day  hour  PM2.5  PM10  SO2   NO2     CO    O3  TEMP  \
0   1  2013      3    1     0    9.0   9.0  3.0  17.0  300.0  89.0  -0.5   
1   2  2013      3    1     1    4.0   4.0  3.0  16.0  300.0  88.0  -0.7   
2   3  2013      3    1     2    7.0   7.0  NaN  17.0  300.0  60.0  -1.2   
3   4  2013      3    1     3    3.0   3.0  5.0  18.0    NaN   NaN  -1.4   
4   5  2013      3    1     4    3.0   3.0  7.0   NaN  200.0  84.0  -1.9   

     PRES  DEWP  RAIN   wd  WSPM station  
0  1024.5 -21.4   0.0  NNW   5.7  Dongsi  
1  1025.1 -22.1   0.0   NW   3.9  Dongsi  
2  1025.3 -24.6   0.0  NNW   5.3  Dongsi  
3  1026.2 -25.5   0.0    N   4.9  Dongsi  
4  1027.1 -24.5   0.0  NNW   3.2  Dongsi  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210384 entries, 0 to 210383
Data columns (total 18 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   No       210384 non-null  int64  
 1   year     210384 non-null  int64  
 2   month    2103

In [3]:
# Load and merge datasets
dataframes = [pd.read_csv(file) for file in files]
combined_data = pd.concat(dataframes, ignore_index=True)

In [4]:
# Display the number of rows and columns
print("Number of rows:", combined_data.shape[0])
print("Number of columns:", combined_data.shape[1])

Number of rows: 210384
Number of columns: 18


In [5]:
print("\nData types of each column:")
print(combined_data.dtypes)



Data types of each column:
No           int64
year         int64
month        int64
day          int64
hour         int64
PM2.5      float64
PM10       float64
SO2        float64
NO2        float64
CO         float64
O3         float64
TEMP       float64
PRES       float64
DEWP       float64
RAIN       float64
wd          object
WSPM       float64
station     object
dtype: object


In [6]:
print("\nMissing values in each column:")
print(combined_data.isnull().sum())


Missing values in each column:
No             0
year           0
month          0
day            0
hour           0
PM2.5       4506
PM10        3128
SO2         4366
NO2         6624
CO         11157
O3          5712
TEMP         213
PRES         214
DEWP         218
RAIN         209
wd          1181
WSPM         177
station        0
dtype: int64


In [11]:

# Importing the datasets
data_dongsi = pd.read_csv('/content/PRSA_Data_Dongsi_20130301-20170228.csv')
data_guanyuan = pd.read_csv('/content/PRSA_Data_Guanyuan_20130301-20170228.csv')
data_gucheng = pd.read_csv('/content/PRSA_Data_Gucheng_20130301-20170228.csv')
data_huairou = pd.read_csv('/content/PRSA_Data_Huairou_20130301-20170228.csv')
data_nongzhanguan = pd.read_csv('/content/PRSA_Data_Nongzhanguan_20130301-20170228.csv')
data_shunyi = pd.read_csv('/content/PRSA_Data_Shunyi_20130301-20170228.csv')

In [12]:
# Merging the datasets
data_merged = pd.concat([data_dongsi, data_guanyuan, data_gucheng, data_huairou, data_nongzhanguan, data_shunyi], ignore_index=True)

In [13]:
# Removing Duplicate Entries
# This removes any row that is an exact duplicate of another
data_merged.drop_duplicates(inplace=True)

In [14]:
# Optionally, you can specify a subset of columns to consider for identifying duplicates
# data_merged.drop_duplicates(subset=['column1', 'column2'], inplace=True)

# Show how many records are in the dataset after duplicates are removed
print(f"Number of records after removing duplicates: {data_merged.shape[0]}")


Number of records after removing duplicates: 210384


In [15]:
# Display the first few rows to verify changes
print(data_merged.head())

   No  year  month  day  hour  PM2.5  PM10  SO2   NO2     CO    O3  TEMP  \
0   1  2013      3    1     0    9.0   9.0  3.0  17.0  300.0  89.0  -0.5   
1   2  2013      3    1     1    4.0   4.0  3.0  16.0  300.0  88.0  -0.7   
2   3  2013      3    1     2    7.0   7.0  NaN  17.0  300.0  60.0  -1.2   
3   4  2013      3    1     3    3.0   3.0  5.0  18.0    NaN   NaN  -1.4   
4   5  2013      3    1     4    3.0   3.0  7.0   NaN  200.0  84.0  -1.9   

     PRES  DEWP  RAIN   wd  WSPM station  
0  1024.5 -21.4   0.0  NNW   5.7  Dongsi  
1  1025.1 -22.1   0.0   NW   3.9  Dongsi  
2  1025.3 -24.6   0.0  NNW   5.3  Dongsi  
3  1026.2 -25.5   0.0    N   4.9  Dongsi  
4  1027.1 -24.5   0.0  NNW   3.2  Dongsi  


In [18]:
print(data_merged.columns)

Index(['No', 'year', 'month', 'day', 'hour', 'PM2.5', 'PM10', 'SO2', 'NO2',
       'CO', 'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'wd', 'WSPM', 'station'],
      dtype='object')
