## Objective:
- Analyze historical climate data for Tanzania to understand trends and predict future climate patterns.

In [4]:
#import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


### 2. Data Preprocessing:
Tasks:

- Handle missing values (if any).

- Convert data types as necessary (e.g., datetime conversion).

- Feature engineering: Extract relevant features such as seasonal trends, average temperatures, and precipitation levels.

- Encoding categorical variables (if applicable).

- Output: Cleaned dataset ready for exploratory data analysis (EDA) and modeling.

In [5]:
# Read the raw climate records (one row per Year/Month) and preview the first rows.
climatedf = pd.read_csv(filepath_or_buffer="data/tanzania_climate_data.csv")
climatedf.head(n=5)

Unnamed: 0,Year,Month,Average_Temperature_C,Total_Rainfall_mm,Max_Temperature_C,Min_Temperature_C
0,2000,1,26.1,19.8,32.0,21.9
1,2000,2,25.8,87.3,29.5,22.7
2,2000,3,26.8,266.5,29.9,21.8
3,2000,4,26.3,136.7,30.1,22.9
4,2000,5,26.0,63.1,30.7,22.4


In [6]:
# Dimensions of the loaded frame as (rows, columns).
climatedf.shape

(252, 6)

In [7]:
# Column dtypes and non-null counts; a non-null count equal to the number of
# entries for every column means there are no missing values.
climatedf.info()  # no missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   252 non-null    int64  
 1   Month                  252 non-null    int64  
 2   Average_Temperature_C  252 non-null    float64
 3   Total_Rainfall_mm      252 non-null    float64
 4   Max_Temperature_C      252 non-null    float64
 5   Min_Temperature_C      252 non-null    float64
dtypes: float64(4), int64(2)
memory usage: 11.9 KB


In [8]:
# Count missing entries per column; all zeros confirms there is nothing to impute.
climatedf.isna().sum()

Year                     0
Month                    0
Average_Temperature_C    0
Total_Rainfall_mm        0
Max_Temperature_C        0
Min_Temperature_C        0
dtype: int64

In [9]:
# Combine Year and Month into a single timestamp column. The data has no
# day-of-month field, so every record is pinned to the first day of its month.
climatedf['date'] = pd.to_datetime(
    {
        'year': climatedf['Year'],
        'month': climatedf['Month'],
        'day': 1,
    }
)

In [10]:
# Preview the frame to check the newly added `date` column.
climatedf.head()

Unnamed: 0,Year,Month,Average_Temperature_C,Total_Rainfall_mm,Max_Temperature_C,Min_Temperature_C,date
0,2000,1,26.1,19.8,32.0,21.9,2000-01-01
1,2000,2,25.8,87.3,29.5,22.7,2000-02-01
2,2000,3,26.8,266.5,29.9,21.8,2000-03-01
3,2000,4,26.3,136.7,30.1,22.9,2000-04-01
4,2000,5,26.0,63.1,30.7,22.4,2000-05-01


In [11]:
# Re-check the schema: `date` should be reported with a datetime64 dtype.
climatedf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Year                   252 non-null    int64         
 1   Month                  252 non-null    int64         
 2   Average_Temperature_C  252 non-null    float64       
 3   Total_Rainfall_mm      252 non-null    float64       
 4   Max_Temperature_C      252 non-null    float64       
 5   Min_Temperature_C      252 non-null    float64       
 6   date                   252 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 13.9 KB


In [16]:
# Year and Month are already encoded in `date`, so drop the redundant columns.
# Rebinding the result (rather than inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it once the columns are gone is a
# no-op instead of raising KeyError.
climatedf = climatedf.drop(columns=['Year', 'Month'], errors='ignore')

In [17]:
# Preview again to confirm Year and Month are gone and `date` remains.
climatedf.head()

Unnamed: 0,Average_Temperature_C,Total_Rainfall_mm,Max_Temperature_C,Min_Temperature_C,date
0,26.1,19.8,32.0,21.9,2000-01-01
1,25.8,87.3,29.5,22.7,2000-02-01
2,26.8,266.5,29.9,21.8,2000-03-01
3,26.3,136.7,30.1,22.9,2000-04-01
4,26.0,63.1,30.7,22.4,2000-05-01


In [19]:
# Persist the cleaned frame for later use; index=False leaves the positional
# row index out of the file.
climatedf.to_csv(
    path_or_buf="data/cleanData.csv",
    index=False,
)