In [None]:
#Name:Aisha Amenhali
#Date:6-4-2025
# Section 1 - Weather Prediction Model and Data Analysis for Temperature

In [None]:
pip install --upgrade scikit-learn

In [None]:
#import libraries and packages
import numpy as np
import pandas as pd
from scipy.stats import mode
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score
import joblib
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_iris 
from datetime import datetime

In [None]:
# Step 1 -> load the CSV file into a DataFrame.
# The unwanted 'Unnamed: 7' column is dropped in a later cell, not here.
# NOTE(review): the column headers are referenced as "Temp_Max[?øC]" etc. in
# this notebook, which suggests the degree sign is not decoded correctly with
# cp1252 - confirm the file's real encoding before changing it, because every
# column reference below depends on the current header text.
data = pd.read_csv("Dataset_temperature.csv", encoding='cp1252')

# Preview only the first rows instead of dumping the entire frame.
data.head()

In [None]:
# Data Cleaning

In [None]:
# Check whether the dataset is balanced: count the rows in each weather category
weather_counts = data["Weather Category"].value_counts()

# Two-column frame (category name, number of rows) used as the plotting source
temp_data = weather_counts.rename_axis("weather condition").reset_index(name="Counts")

# Bar chart of the class frequencies; x labels are rotated so they do not overlap
fig, ax = plt.subplots(figsize=(18, 8))
sns.barplot(data=temp_data, x="weather condition", y="Counts", ax=ax)
ax.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# Drop the empty trailing column produced by the CSV export.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
data = data.drop(columns=['Unnamed: 7'], errors='ignore')

In [None]:
# Check the first few rows of the dataset.
# Leaving the frame as the last expression uses the notebook's rich table
# display instead of the plain-text print() rendering.
data.head()

In [None]:
# Column names, non-null counts and dtypes.
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
data.info()

In [None]:
# Summary statistics of the numeric columns, shown with the rich table display.
data.describe()

In [None]:
# Handle outliers: box plots of the three temperature columns make extreme values visible
temperature_columns = ["Temp_Max[?øC]", "Temp_Min[?øC]", "Temp_Mean[?øC]"]
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data[temperature_columns], ax=ax)
plt.show()

In [None]:
# Remove outliers using the IQR method: a row is an outlier when any of its
# temperature values lies more than 1.5 * IQR outside the quartiles.
temp_cols = ["Temp_Max[?øC]", "Temp_Min[?øC]", "Temp_Mean[?øC]"]
temps = data[temp_cols]

Q1 = temps.quantile(0.25)
Q3 = temps.quantile(0.75)
IQR = Q3 - Q1

low = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

# Compare only the temperature columns against the fences. The fences are
# indexed by those three columns, so comparing the whole frame (date and
# category columns included) mis-aligns the labels and fails on pandas 2.x.
outlier_rows = ((temps < low) | (temps > upper)).any(axis=1)
data_cleaned = data[~outlier_rows]

# Report how much was removed and preview the result instead of dumping it all
print(f"Removed {int(outlier_rows.sum())} of {len(data)} rows as outliers")
data_cleaned.head()

In [None]:
# Verify the dataset after outlier removal.
# Only the temperature columns are plotted: the fences were computed for these
# columns, and "Weather Category" / "Emirate" are label columns that get
# label-encoded further down, so they do not belong on a box plot of
# temperatures.
plt.figure(figsize=(10,6))
sns.boxplot(data=data_cleaned[["Temp_Max[?øC]","Temp_Min[?øC]","Temp_Mean[?øC]"]])
plt.title("Box Plot After Outlier Removal")
# plt.show() returns None; printing it only added a stray "None" to the output
plt.show()

In [None]:
# Clip extreme temperature values to the IQR fences instead of dropping the rows,
# so that no observations are lost. `low` and `upper` come from the IQR cell.
clip_columns = ["Temp_Max[?øC]", "Temp_Min[?øC]", "Temp_Mean[?øC]"]
data[clip_columns] = data[clip_columns].clip(lower=low, upper=upper, axis=1)


In [None]:
#Data Visualization 

In [None]:
# Scatter plot: relationship between the maximum and minimum temperature.
# Points are coloured by the mean temperature so that the colour bar has data
# to describe; a bare plt.colorbar() on an uncoloured scatter has nothing to map.
points = plt.scatter(
    data["Temp_Max[?øC]"],
    data["Temp_Min[?øC]"],
    c=data["Temp_Mean[?øC]"],
    cmap="viridis",
)
plt.title("Scatter Plot")
plt.xlabel("Temperature Maximum")
plt.ylabel("Temperature Minimum")
plt.colorbar(points, label="Temperature Mean")
# plt.show() returns None, so it is called directly rather than printed
plt.show()

In [None]:
# Line chart: average mean temperature for each weather condition.
# Drawing the raw category column and the raw temperature column against the
# row index put labels and numbers on the same y-axis and showed no
# relationship; aggregating per category matches the axis labels below.
mean_temp_by_weather = data.groupby("Weather Category")["Temp_Mean[?øC]"].mean()

# astype(str) keeps the x-axis categorical even if the labels are already encoded as integers
plt.plot(mean_temp_by_weather.index.astype(str), mean_temp_by_weather.values, marker="o")
plt.title("Line Chart")
plt.xlabel('Weather condition')
plt.ylabel("Average Temperature")
plt.xticks(rotation=90)
# plt.show() returns None, so it is called directly rather than printed
plt.show()

In [None]:
# Multivariate analysis (comparing multiple variables): pairwise scatter plots
# and per-column distributions of the three temperature columns
pairplot_columns = ["Temp_Max[?øC]", "Temp_Min[?øC]", "Temp_Mean[?øC]"]
sns.pairplot(data=data[pairplot_columns])
plt.show()

In [None]:
# Look for correlations among the features.
cols = ["Year","Month","Day","Temp_Max[?øC]","Temp_Min[?øC]","Temp_Mean[?øC]","Weather Category","Emirate"]

# "Weather Category" and "Emirate" are only label-encoded further down, so at
# this point they appear to still be text (TODO confirm with data.dtypes).
# corr() on text columns raises on pandas 2.x, so only numeric columns are used.
cor_matrix = data[cols].select_dtypes(include="number").corr()
cor_matrix

In [None]:
# Correlation heatmap of the numeric columns.
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises
# on text columns in pandas 2.x.
numeric_corr = data.select_dtypes(include="number").corr()

plt.figure(figsize=(11,7))
# `linewidths` is the documented seaborn parameter for the cell borders
sns.heatmap(numeric_corr, annot=True, cmap='spring', linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Pie chart showing each weather category's share of the rows
category_counts = data['Weather Category'].value_counts()

plt.pie(
    category_counts.values,
    labels=category_counts.index,
    autopct='%1.1f%%',                    # percentage printed on each wedge
    colors=['lightblue', 'lightgreen', 'pink', 'red', 'purple'],
    startangle=90,                        # start the first wedge at 12 o'clock
    wedgeprops={'edgecolor': 'black'},    # outline the wedges
)
plt.title('Distribution of Weather Categories')
plt.show()

In [None]:
# Number of rows in each weather category (same counts the pie chart is built from)
data['Weather Category'].value_counts()

In [None]:
# Show the dtype of every column before the label-encoding cells run
data.dtypes

In [None]:
# Step 4: Build the predictive model for weather prediction

In [None]:
# Encode the weather category labels as integers.
# The fitted encoder gets its own name because `le` is reassigned by later
# cells; keeping this object is what allows predictions to be mapped back to
# category names with weather_encoder.inverse_transform(...).
# LabelEncoder is already imported in the imports cell at the top.
weather_encoder = LabelEncoder()
data['Weather Category'] = weather_encoder.fit_transform(data['Weather Category'])

# Keep the original name bound as well, for any cell that still refers to `le`
le = weather_encoder

In [None]:
# Encode the emirate names as integers.
# A dedicated encoder object is kept so the integer codes can be translated
# back with emirate_encoder.inverse_transform(...) even after `le` is reused.
# LabelEncoder is already imported in the imports cell at the top.
emirate_encoder = LabelEncoder()
data['Emirate'] = emirate_encoder.fit_transform(data['Emirate'])

# Keep the original name bound as well, for any cell that still refers to `le`
le = emirate_encoder

In [None]:
# Feature scaling: rescale the three temperature columns with a MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

X = data[["Temp_Max[?øC]","Temp_Min[?øC]","Temp_Mean[?øC]"]]

# Fit the scaler on the temperature columns, then transform those same rows
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Wrap the scaled array in a frame again so the column names are kept
X_scaled_d = pd.DataFrame(data=X_scaled, columns=X.columns)
print(X_scaled_d.head())

In [None]:
# Split the data for training and testing the model: 80% training, 20% testing.
# The target is selected by name. Taking "the last column" by position depends
# on the CSV's column order and disagrees with the modelling cells further
# down, which all predict "Weather Category".
X = data.drop(columns=["Weather Category"])
Y = data["Weather Category"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"Y_train: {Y_train.shape}, Y_test: {Y_test.shape}")

In [None]:
# Split the data for training and testing the model: 80% training, 20% testing.
# The target is selected by name rather than by column position, consistent
# with the modelling cells that predict "Weather Category".
X = data.drop(columns=["Weather Category"])
Y = data["Weather Category"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
print(f"Train:{X_train.shape},{Y_train.shape}")
# This line reports the test split; it was previously mislabelled "Train:"
print(f"Test:{X_test.shape},{Y_test.shape}")

In [None]:
# Check which feature columns ended up in the training frame
print(X_train.columns)

In [None]:
# Sanity checks on the training features; every count should be 0.
# DataFrame.isna() works for any dtype, whereas np.isnan raises TypeError as
# soon as one column is non-numeric.
print(X_train.isna().sum())
# Infinity only exists for numeric data, so the check is limited to those columns
print(np.isinf(X_train.select_dtypes(include="number")).sum())

In [None]:
# Re-split 80/20, stratifying on the target whenever it has more than one
# distinct value (stratification is switched off for a single-class target)
stratify_labels = Y if len(set(Y)) > 1 else None
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

# Both shapes should have a non-zero number of rows
print(X_train.shape)
print(Y_train.shape)


In [None]:
# Report the size of the training features and show a few sample rows
train_shape = X_train.shape
print("X_train shape:", train_shape)

train_preview = X_train.head()
print("Sample X_train rows:\n", train_preview)

In [None]:
# Stratified 80/20 split: class proportions of Y are preserved in both parts
split_options = dict(test_size=0.2, random_state=42, stratify=Y)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)

In [None]:
# Using K-Fold Cross-Validation for model selection
# Model building: prepare an all-numeric feature matrix and an encoded target

# Select the target by name (consistent with the later modelling cells).
# drop() returns a new frame, so the column assignments below modify X only
# and no longer trigger SettingWithCopyWarning on a slice of `data`.
X = data.drop(columns=["Weather Category"])
y = data["Weather Category"]

# Identify categorical (text) columns among the features
categorical_cols = X.select_dtypes(include=['object']).columns

# Label-encode each text column with its own encoder, so that one column's
# fitted classes are not overwritten by the next column's
for col in categorical_cols:
    X[col] = LabelEncoder().fit_transform(X[col])

# One-hot encode any remaining non-numeric columns
X = pd.get_dummies(X, drop_first=True)

# Convert the target variable to integer codes; `le` stays fitted on the target
le = LabelEncoder()
y = le.fit_transform(y)

print(X.dtypes)
X.head()  # Check first few rows

In [None]:
# Standardise every feature column (StandardScaler fitted on the full matrix X)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Univariate feature selection with the ANOVA F-test (f_classif).
# k is capped at the number of available columns: asking for 10 features from
# a narrower matrix is rejected (or warned about) by SelectKBest.
# SelectKBest and f_classif are already imported in the imports cell at the top.
k_best = min(10, X_scaled.shape[1])
X_selected = SelectKBest(f_classif, k=k_best).fit_transform(X_scaled, y)

In [None]:
#Using K-Fold Cross-Validation for Model Selection
#Model Building

In [None]:
# Feature matrix: date parts, the three temperatures and the encoded emirate
feature_columns = [
    "Year",
    "Month",
    "Day",
    "Temp_Max[?øC]",
    "Temp_Mean[?øC]",
    "Temp_Min[?øC]",
    "Emirate",
]
X = data[feature_columns]

# Target: the weather category
y = data["Weather Category"]

In [None]:
# Standardise the selected features
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Balance the classes by oversampling with SMOTE.
# A fixed random_state makes the synthetic samples, and therefore every score
# computed from them, reproducible across Restart & Run All.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

In [None]:
# Dimensionality reduction: a float n_components asks PCA to keep as many
# components as needed to explain 95% of the variance
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X=X_resampled)

In [None]:
# Compare three classifiers with 10-fold stratified cross-validation.
# NOTE(review): X_pca / y_resampled were oversampled with SMOTE before the
# folds are created, so synthetic samples can end up in validation folds and
# the scores below are probably optimistic. Consider moving scaling, SMOTE and
# PCA into an imblearn Pipeline that is fitted inside each fold.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Model 1: SVC
svc_params = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 0.1, 0.01],
    'kernel': ['rbf', 'poly']
}
svc_grid = GridSearchCV(SVC(), svc_params, cv=cv, n_jobs=-1)
svc_grid.fit(X_pca, y_resampled)
# best_score_ already is the mean CV accuracy of the best parameter set on
# these folds; re-running cross_val_score on best_estimator_ repeated that work
svc_score = svc_grid.best_score_

# Model 2: GaussianNB (no hyper-parameters to tune, so it is scored directly)
nb_model = GaussianNB()
nb_score = cross_val_score(nb_model, X_pca, y_resampled, cv=cv).mean()

# Model 3: Random Forest
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'max_features': ['sqrt', 'log2']
}
# random_state makes the forest, and with it the selected parameters, reproducible
rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=cv, n_jobs=-1)
rf_grid.fit(X_pca, y_resampled)
rf_score = rf_grid.best_score_

# Print all results
print("============================================================")
print("Best SVC Score:", svc_score)
print("Best GaussianNB Score:", nb_score)
print("Best Random Forest Score:", rf_score)
print("============================================================")

In [None]:
# Create the Random Forest with a fixed seed and fit it to the training data
rf_model = RandomForestClassifier(random_state=18).fit(X_train, Y_train)

# fit() returns the estimator itself; show it as the cell output
rf_model

In [None]:
# Model prediction: predict labels for the test features with the fitted random forest
y_pred_R=rf_model.predict(X_test)

In [None]:
# Model evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Accuracy of the random forest on the test split
accuracy_score(y_true=Y_test, y_pred=y_pred_R)

In [None]:
def evaluate_classifier(model, label, X_train, X_test, y_train, y_test):
    """Fit ``model`` and report its accuracy and test confusion matrix.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Classifier to train; it is fitted in place.
    label : str
        Human-readable name used in the printed messages and the plot title.
    X_train, X_test : feature matrices for the training and test split.
    y_train, y_test : target labels for the training and test split.

    Returns
    -------
    tuple
        ``(test_predictions, confusion_matrix)`` for the test split.
    """
    model.fit(X_train, y_train)
    test_preds = model.predict(X_test)

    print(f"Accuracy on train data by {label}: {accuracy_score(y_train, model.predict(X_train)) * 100}")
    print(f"Accuracy on test data by {label}: {accuracy_score(y_test, test_preds) * 100}")

    matrix = confusion_matrix(y_test, test_preds)
    fig, ax = plt.subplots(figsize=(12, 8))
    # fmt="d" prints the counts as whole numbers instead of 1.2e+02-style text
    sns.heatmap(matrix, annot=True, fmt="d", ax=ax)
    ax.set_title(f"Confusion Matrix for {label} on Test Data")
    plt.show()
    return test_preds, matrix


# Features: the three temperature columns; target: the weather category
X = data[["Temp_Max[?øC]","Temp_Min[?øC]","Temp_Mean[?øC]"]]
y = data['Weather Category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

# The same three model names as before stay bound for the cells that follow
svm_model = SVC()
nb_model = GaussianNB()
rf_model = RandomForestClassifier(random_state=18)

# One shared evaluation routine instead of three copy-pasted blocks. After the
# loop, `preds` and `cf_matrix` hold the Random Forest results, as before.
for model, label in [
    (svm_model, "SVM Classifier"),
    (nb_model, "Naive Bayes Classifier"),
    (rf_model, "Random Forest Classifier"),
]:
    preds, cf_matrix = evaluate_classifier(model, label, X_train, X_test, y_train, y_test)

In [None]:
# Stratified 80/20 split of the current X / y, keeping class proportions in both parts
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Cross-validation scores on the current training split.
# The labels must be `y_train`, which comes from the same split as X_train.
# `Y_train` (capital Y) is left over from an earlier, different split, so its
# rows do not correspond to the rows of this X_train.
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=10, n_jobs=-1)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))

# cross_val_score only fits clones, so rf_model itself is still the forest
# fitted on the previous split, whose training rows overlap this test set.
# Refit it on the current training data before evaluating.
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on Test Set: {accuracy:.2f}")

# Detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion_mat)