In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [4]:
# Load the dataset
data = pd.read_csv("C:/Users/uday8/Documents/SEM_6/GIS/Project/IndianWeatherRepository.csv")

In [5]:
# Explore the dataset
print(data.head())  # Check the first few rows of the dataset
print(data.info())   # Check data types and missing values

  country location_name          region  latitude  longitude      timezone  \
0   India    Ashoknagar  Madhya Pradesh     24.57      77.72  Asia/Kolkata   
1   India        Raisen  Madhya Pradesh     23.33      77.80  Asia/Kolkata   
2   India    Chhindwara  Madhya Pradesh     22.07      78.93  Asia/Kolkata   
3   India         Betul  Madhya Pradesh     21.86      77.93  Asia/Kolkata   
4   India   Hoshangabad  Madhya Pradesh     22.75      77.72  Asia/Kolkata   

   last_updated_epoch      last_updated  temperature_celsius  \
0          1693286100  2023-08-29 10:45                 27.5   
1          1693286100  2023-08-29 10:45                 27.5   
2          1693286100  2023-08-29 10:45                 26.3   
3          1693286100  2023-08-29 10:45                 25.6   
4          1693286100  2023-08-29 10:45                 27.2   

   temperature_fahrenheit  ... air_quality_PM2.5  air_quality_PM10  \
0                    81.5  ...              12.6              18.5   
1     

In [6]:
# Drop irrelevant columns if any
data = data.drop(columns=['latitude','longitude','last_updated_epoch','last_updated', 'timezone','wind_mph','wind_kph','wind_degree','wind_direction','pressure_mb','pressure_mb','pressure_in',	'precip_mm',	'precip_in',	'humidity',	'cloud',	'feels_like_celsius',	'feels_like_fahrenheit',	'visibility_km',	'visibility_miles',	'uv_index',	'gust_mph',	'gust_kph',	'air_quality_Carbon_Monoxide',	'air_quality_Ozone',	'air_quality_Nitrogen_dioxide',	'air_quality_Sulphur_dioxide',	'air_quality_PM2.5',	'air_quality_PM10',	'air_quality_us-epa-index',	'air_quality_gb-defra-index',	'sunrise',	'sunset',	'moonrise',	'moonset',	'moon_phase',	'moon_illumination'
])

In [7]:
data

Unnamed: 0,country,location_name,region,temperature_celsius,temperature_fahrenheit,condition_text
0,India,Ashoknagar,Madhya Pradesh,27.5,81.5,Partly cloudy
1,India,Raisen,Madhya Pradesh,27.5,81.5,Sunny
2,India,Chhindwara,Madhya Pradesh,26.3,79.3,Partly cloudy
3,India,Betul,Madhya Pradesh,25.6,78.1,Cloudy
4,India,Hoshangabad,Madhya Pradesh,27.2,81.0,Cloudy
...,...,...,...,...,...,...
93163,India,Niwari,Uttar Pradesh,19.0,66.2,Mist
93164,India,Saitual,Mizoram,9.7,49.5,Clear
93165,India,Ranipet,Tamil Nadu,20.8,69.4,Clear
93166,India,Tenkasi,Tamil Nadu,29.0,84.2,Mist


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93168 entries, 0 to 93167
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   country                 93168 non-null  object 
 1   location_name           93168 non-null  object 
 2   region                  93168 non-null  object 
 3   temperature_celsius     93168 non-null  float64
 4   temperature_fahrenheit  93168 non-null  float64
 5   condition_text          93168 non-null  object 
dtypes: float64(2), object(4)
memory usage: 4.3+ MB


In [9]:
# Handle missing values if any
data.dropna(inplace=True)  # Remove rows with missing values
print(data)

      country location_name          region  temperature_celsius  \
0       India    Ashoknagar  Madhya Pradesh                 27.5   
1       India        Raisen  Madhya Pradesh                 27.5   
2       India    Chhindwara  Madhya Pradesh                 26.3   
3       India         Betul  Madhya Pradesh                 25.6   
4       India   Hoshangabad  Madhya Pradesh                 27.2   
...       ...           ...             ...                  ...   
93163   India        Niwari   Uttar Pradesh                 19.0   
93164   India       Saitual         Mizoram                  9.7   
93165   India       Ranipet      Tamil Nadu                 20.8   
93166   India       Tenkasi      Tamil Nadu                 29.0   
93167   India        Pendra     Maharashtra                 27.9   

       temperature_fahrenheit condition_text  
0                        81.5  Partly cloudy  
1                        81.5          Sunny  
2                        79.3  Partly clou

In [10]:
# Encode categorical variables
label_encoders = {}
categorical_columns = ['country', 'location_name', 'region', 'condition_text']
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

In [11]:
# Convert the dictionary to a DataFrame
data_df = pd.DataFrame(data)

# Define the numerical columns
numerical_columns = ['temperature_celsius', 'temperature_fahrenheit']

In [12]:
# Standardize the numerical columns
scaler=StandardScaler()
data_df[numerical_columns] = scaler.fit_transform(data_df[numerical_columns])

In [13]:
# Example data
data ={'condition_text': ['Partly cloudy', 'Sunny	', 'Cloudy', 'Mist', 'Patchy rain possible','Overcast', 'Patchy light rain with thunder']}
df = pd.DataFrame(data)

In [14]:
df

Unnamed: 0,condition_text
0,Partly cloudy
1,Sunny\t
2,Cloudy
3,Mist
4,Patchy rain possible
5,Overcast
6,Patchy light rain with thunder


In [15]:
# Perform one-hot encoding
one_hot_encoded = pd.get_dummies(df['condition_text'])

In [16]:
# Concatenate the one-hot encoded columns with the original dataframe
data_encoded = pd.concat([df, one_hot_encoded], axis=1)

print(data_encoded)

                   condition_text  Cloudy  Mist  Overcast  Partly cloudy  \
0                   Partly cloudy       0     0         0              1   
1                         Sunny\t       0     0         0              0   
2                          Cloudy       1     0         0              0   
3                            Mist       0     1         0              0   
4            Patchy rain possible       0     0         0              0   
5                        Overcast       0     0         1              0   
6  Patchy light rain with thunder       0     0         0              0   

   Patchy light rain with thunder  Patchy rain possible  Sunny\t  
0                               0                     0        0  
1                               0                     0        1  
2                               0                     0        0  
3                               0                     0        0  
4                               0                     1 

In [17]:
data_encoded.head()

Unnamed: 0,condition_text,Cloudy,Mist,Overcast,Partly cloudy,Patchy light rain with thunder,Patchy rain possible,Sunny\t
0,Partly cloudy,0,0,0,1,0,0,0
1,Sunny\t,0,0,0,0,0,0,1
2,Cloudy,1,0,0,0,0,0,0
3,Mist,0,1,0,0,0,0,0
4,Patchy rain possible,0,0,0,0,0,1,0


In [18]:
data_encoded.columns

Index(['condition_text', 'Cloudy', 'Mist', 'Overcast', 'Partly cloudy',
       'Patchy light rain with thunder', 'Patchy rain possible', 'Sunny\t'],
      dtype='object')

In [31]:
# Define features and target variable
features = ['condition_text', 'Cloudy', 'Mist', 'Overcast', 'Partly cloudy',
       'Patchy light rain with thunder', 'Patchy rain possible', 'Sunny\t']
target = 'Overcast'

X = data_encoded[features]
y = data_encoded[target]


In [32]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [33]:
# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


ValueError: could not convert string to float: 'Overcast'

In [None]:
# Make predictions
y_pred = model.predict(X_test)

In [None]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Create a confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Visualize the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()