In [100]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [101]:
# Read the Africa climate CSV (path is relative to the working directory)
# into the frame that every later cell refers to as `df`.
df = pd.read_csv(filepath_or_buffer='Africa_climate_change.csv')

In [102]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 464815 entries, 0 to 464814
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   DATE     464815 non-null  object 
 1   PRCP     177575 non-null  float64
 2   TAVG     458439 non-null  float64
 3   TMAX     363901 non-null  float64
 4   TMIN     332757 non-null  float64
 5   COUNTRY  464815 non-null  object 
dtypes: float64(4), object(2)
memory usage: 21.3+ MB


In [103]:
# Count the missing (NaN) entries in each column of the raw data.
print(df.isna().sum())

DATE            0
PRCP       287240
TAVG         6376
TMAX       100914
TMIN       132058
COUNTRY         0
dtype: int64


In [104]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,PRCP,TAVG,TMAX,TMIN
count,177575.0,458439.0,363901.0,332757.0
mean,0.120941,77.029838,88.713969,65.548262
std,0.486208,11.523634,13.042631,11.536547
min,0.0,-49.0,41.0,12.0
25%,0.0,70.0,81.0,58.0
50%,0.0,80.0,90.0,68.0
75%,0.01,85.0,99.0,74.0
max,19.69,110.0,123.0,97.0


In [105]:
# Pairwise correlation between the four columns that contain missing values.
correlation_matrix = df.loc[:, ['PRCP', 'TAVG', 'TMAX', 'TMIN']].corr()

In [106]:
# Show the correlation table held in `correlation_matrix`.
print(correlation_matrix)

          PRCP      TAVG      TMAX      TMIN
PRCP  1.000000  0.058191  0.012396  0.122140
TAVG  0.058191  1.000000  0.943893  0.913932
TMAX  0.012396  0.943893  1.000000  0.790032
TMIN  0.122140  0.913932  0.790032  1.000000


In [None]:
# Preview the first 20 rows of the frame.
df.head(20)

In [108]:
# Sensei, please ignore this cell: unused alternative, kept for reference only.
# 1. Drop columns with a high percentage of missing values (if applicable).
# threshold = 0.4  # for example, if 40% or more of the data is missing, drop the column
# df = df.dropna(thresh=int((1 - threshold) * len(df)), axis=1)


In [109]:
# Fill missing precipitation values with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['PRCP']: that chained in-place call works on
# an intermediate Series, raises a FutureWarning in recent pandas 2.x releases
# and leaves `df` unchanged once Copy-on-Write is enabled.
# NOTE(review): describe() above reports a PRCP median of 0.0, so every
# missing day ends up recorded as dry - confirm that this is intended.
df['PRCP'] = df['PRCP'].fillna(df['PRCP'].median())

In [110]:
# Re-count missing values per column to verify 'PRCP' no longer has NaNs.
print(df.isna().sum(axis=0))

DATE            0
PRCP            0
TAVG         6376
TMAX       100914
TMIN       132058
COUNTRY         0
dtype: int64


In [111]:
# Build per-country groups so that missing temperature values can be filled
# with the mean of the country they belong to.
grouped = df.groupby(by='COUNTRY')



In [112]:
# Impute each temperature column with the mean of the row's own country group,
# then round the column to two decimal places.
for temperature_column in ('TAVG', 'TMAX', 'TMIN'):
    imputed = grouped[temperature_column].transform(
        lambda values: values.fillna(values.mean())
    )
    df[temperature_column] = imputed.round(2)

In [131]:
# Inspect 30 randomly chosen rows after imputation. A fixed random_state keeps
# the preview identical across kernel restarts and re-runs.
df.sample(30, random_state=42)

Unnamed: 0,DATE,PRCP,TAVG,TMAX,TMIN,COUNTRY,YEAR
336699,2012-06-22,0.0,88.0,78.51,75.0,Tunisia,2012
81688,1989-01-16,0.0,55.0,65.0,45.0,Egypt,1989
232570,2003-07-06,0.0,100.0,107.0,61.85,Egypt,2003
344157,2013-02-08,0.0,48.0,55.0,42.0,Tunisia,2013
62228,1987-01-17,0.0,47.0,55.0,57.17,Tunisia,1987
394277,2017-04-08,0.0,88.0,104.0,71.77,Senegal,2017
337183,2012-07-07,0.0,81.0,95.46,71.0,Senegal,2012
130327,1993-11-01,0.0,76.0,85.0,69.0,Egypt,1993
420601,2019-06-24,0.0,82.0,88.0,75.0,Egypt,2019
172608,1997-08-10,0.0,73.84,97.0,75.0,Egypt,1997


In [114]:
# Final check: every column should now report zero missing values.
print(df.isna().sum())

DATE       0
PRCP       0
TAVG       0
TMAX       0
TMIN       0
COUNTRY    0
dtype: int64


### Plot a line chart to show the average temperature fluctuations in Tunisia and Cameroon.

In [115]:
# Step 1: convert the DATE column from text to datetime.
# Surrounding whitespace is stripped before parsing with the explicit
# '%Y%m%d %H%M%S' format. The dtype guard makes the cell safe to re-run:
# once DATE is already datetime, the .str accessor would raise.
if not pd.api.types.is_datetime64_any_dtype(df['DATE']):
    df['DATE'] = pd.to_datetime(df['DATE'].str.strip(), format='%Y%m%d %H%M%S')



In [None]:
# Tunisia: yearly mean of TAVG drawn as a line chart.
# YEAR is derived from the datetime DATE column; later cells filter and group on it.
df['YEAR'] = df['DATE'].dt.year
selected_countries = ['Tunisia']
grouped_df = (
    df[df['COUNTRY'].isin(selected_countries)]
    .groupby(['YEAR', 'COUNTRY'])
    .agg({'TAVG': 'mean'})
    .reset_index()
)
fig = px.line(
    grouped_df,
    x='YEAR',
    y='TAVG',
    color='COUNTRY',
    title='The Average Temperature Fluctuations in Tunisia',
    height=500,
    width=1000,
    # Readable axis titles with units, using the same wording as the
    # combined Tunisia & Cameroon (1980-2005) chart in this notebook.
    labels={'YEAR': 'Year', 'TAVG': 'Average Temperature (°F)'},
)
fig.show()


## Tunisia
### Result interpretation
Seeing these readings, I immediately inferred that the temperatures must be in Fahrenheit; with values this high in Celsius, Tunisia would be a barren land, devoid of any trace of living creatures.
The lowest temperature dips occurred between 1980 and 1991. Although temperatures appeared to be on a steady rise, there were intermittent dips, but never as low as in those years.  
Tunisia recorded its highest average temperature in 1999, at 70.18, slightly higher than the value recorded in 2022. 

In [None]:
# Cameroon: yearly mean of TAVG drawn as a line chart.
# YEAR is (re)derived from the datetime DATE column so the cell does not
# depend on the Tunisia cell having been run first.
df['YEAR'] = df['DATE'].dt.year
selected_countries = ['Cameroon']
grouped_df = (
    df[df['COUNTRY'].isin(selected_countries)]
    .groupby(['YEAR', 'COUNTRY'])
    .agg({'TAVG': 'mean'})
    .reset_index()
)
fig = px.line(
    grouped_df,
    x='YEAR',
    y='TAVG',
    color='COUNTRY',
    title='The Average Temperature Fluctuations in Cameroon',
    height=500,
    width=1100,
    # Readable axis titles with units, using the same wording as the
    # combined Tunisia & Cameroon (1980-2005) chart in this notebook.
    labels={'YEAR': 'Year', 'TAVG': 'Average Temperature (°F)'},
)
fig.show()


## Cameroon
### Result Interpretation

Temperatures in Cameroon start out relatively steady, with a rather noticeable spike from 1990 to 1991, which had the highest recorded average temperature of 83.40. After the year 2000, there is an apparent rising trend with fluctuations, indicative of a significant increase in average temperature, which has continued to rise over the last two decades.

## Comparative Analysis: 
### Tunisia vs Cameroon

 **General Trend**:

Cameroon experienced a more consistent and sharper increase in temperatures, after 2000. The line has more significant peaks and valleys, indicating higher variability in recent years.
Tunisia shows an increase in temperature until around 2000, after which temperatures stabilized with minor fluctuations.

**Peaks and Valleys**:

Both countries show significant peaks around the late 1980s to early 2000s. However, Cameroon’s chart has a sharper increase in recent years compared to Tunisia.
Tunisia's chart has a noticeable peak around 2000, which is followed by a stabilization, while Cameroon shows ongoing fluctuations and an overall upward trend.

**Temperature Levels**:

Cameroon’s average temperatures are higher than those in Tunisia across the entire time range.

### **Conclusion**:

Cameroon is experiencing more dramatic fluctuations and a more consistent upward trend in average temperatures, suggesting increasing temperatures possibly due to climate change or other environmental factors.
Tunisia, while having an upward trend, shows more stabilization in recent years, indicating a different temperature pattern or response to climatic factors compared to Cameroon.
This comparison highlights the regional differences in temperature trends, with Cameroon experiencing more variability and a steeper increase, whereas Tunisia has seen a peak and then a stabilization in average temperatures.











In [None]:
# Tunisia vs Cameroon, restricted to the years 1980-2005 (both ends included).
selected_countries = ['Tunisia', 'Cameroon']
in_selected_countries = df['COUNTRY'].isin(selected_countries)
in_year_window = df['YEAR'].between(1980, 2005)
filtered_df = df[in_selected_countries & in_year_window]

# Mean TAVG for every (year, country) pair, flattened back to plain columns.
grouped_df = filtered_df.groupby(['YEAR', 'COUNTRY'])['TAVG'].mean().reset_index()

# One line per country, with human-readable axis titles.
axis_labels = {'YEAR': 'Year', 'TAVG': 'Average Temperature (°F)'}
fig = px.line(
    grouped_df,
    x='YEAR',
    y='TAVG',
    color='COUNTRY',
    title='Average Temperature Fluctuations in Tunisia & Cameroon (1980-2005)',
    height=500,
    width=1000,
    labels=axis_labels,
)
fig.show()



In [None]:
# Side-by-side histograms of Senegal's TAVG for two periods:
# 1980-01-01 up to (not including) 2000-01-01, and 2000-01-01 through 2023-12-31.

# Rows for Senegal only, then one slice per period.
df_senegal = df.loc[df['COUNTRY'] == 'Senegal']
senegal_dates = df_senegal['DATE']
df_senegal_1980_2000 = df_senegal.loc[
    (senegal_dates >= '1980-01-01') & (senegal_dates < '2000-01-01')
]
df_senegal_2000_2023 = df_senegal.loc[
    (senegal_dates >= '2000-01-01') & (senegal_dates <= '2023-12-31')
]

# One row, two panels; each panel is described by (axes, data, colour, title).
hist_fig, (early_ax, late_ax) = plt.subplots(1, 2, figsize=(14, 7))
panels = (
    (early_ax, df_senegal_1980_2000, 'blue', 'Temperature Distribution in Senegal (1980-2000)'),
    (late_ax, df_senegal_2000_2023, 'red', 'Temperature Distribution in Senegal (2000-2023)'),
)
for panel_ax, period_df, bar_color, panel_title in panels:
    panel_ax.hist(period_df['TAVG'].dropna(), bins=20, color=bar_color, alpha=0.7)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Temperature (TAVG)')
    panel_ax.set_ylabel('Frequency')

hist_fig.tight_layout()
plt.show()


In [None]:
# Bar chart of the mean TAVG for each country (a bar chart suits a
# comparison of one value per category).
avg_temp_per_country = df.groupby('COUNTRY')['TAVG'].mean()

# Draw on an explicitly created axes object.
bar_fig, bar_ax = plt.subplots(figsize=(12, 6))
avg_temp_per_country.plot.bar(ax=bar_ax, color='skyblue')
bar_ax.set_title('Average Temperature per Country')
bar_ax.set_xlabel('Country')
bar_ax.set_ylabel('Average Temperature (TAVG)')
# Tilt the country names and right-align them under their bars.
plt.setp(bar_ax.get_xticklabels(), rotation=45, ha='right')
bar_ax.grid(axis='y')

plt.show()


In [None]:
# Own question: how are maximum temperatures distributed within each country?

# Box plot of TMAX per country.
# The axes are created first and handed to DataFrame.boxplot through `ax`.
# With `by=...` and no `ax`, boxplot opens a figure of its own, so a preceding
# plt.figure(figsize=...) call only leaves an empty figure behind and its
# size is never applied to the plot.
box_fig, box_ax = plt.subplots(figsize=(12, 6))
df.boxplot(column='TMAX', by='COUNTRY', grid=False, rot=45, ax=box_ax)
box_ax.set_title('Distribution of Maximum Temperatures by Country')
# Remove the automatic "Boxplot grouped by ..." super-title added by pandas.
box_fig.suptitle('')
box_ax.set_xlabel('Country')
box_ax.set_ylabel('Maximum Temperature (TMAX)')

plt.show()


### Question 1

Are there any significant outliers in the temperature data for any of these countries?

### Answer 

Yes, all countries show outliers in the data. Particularly, Egypt and Tunisia have noticeable outliers on both the higher and lower ends, indicating extreme temperature events. Senegal also has a significant number of high outliers, suggesting occasional very hot days.

### Question 2

Based on this data, which country might experience the most stable weather in terms of maximum temperatures?

### Answer 

Angola and Cameroon might experience the most stable weather in terms of maximum temperatures, as indicated by the narrower interquartile range and fewer extreme outliers, suggesting less fluctuation in daily maximum temperatures.














In [None]:
# Area chart to visualize precipitation trends across countries.

# Sum PRCP for every (year, country) pair and pivot the countries into columns.
precipitation_trends = df.groupby(['YEAR', 'COUNTRY'])['PRCP'].sum().unstack()

# DataFrame.plot creates its own figure (sized through `figsize`), so no
# plt.figure() call precedes it - that would only add an empty extra figure.
# The returned axes object is used for all further styling.
area_ax = precipitation_trends.plot(kind='area', stacked=True, alpha=0.6, figsize=(14, 8))

area_ax.set_title('Precipitation Trends Across All Countries Over Time')
area_ax.set_xlabel('YEAR')
area_ax.set_ylabel('Total Precipitation (PRCP)')
# Place the legend outside the plotting area so it does not cover the data.
area_ax.legend(title='Country', bbox_to_anchor=(1.05, 1), loc='upper left')
area_ax.grid(True)

plt.show()


### Question 3
What insights can be derived from the precipitation trend shown in the area chart?

### Answer

#### Key Insights

Highest Increase: Senegal shows the highest increase in precipitation anomalies, indicating a significant rise in extreme weather events.
Senegal is likely to experience more extreme weather events such as heavy rainfall and potential flooding, all of which could significantly affect agriculture and water resources.

Variability: Egypt and Angola exhibit high variability in their precipitation anomalies, suggesting a mix of droughts and heavy rainfall events. 
This could lead to challenges in water management and agricultural planning.


Stability: Cameroon shows the most stable trend, which could lead to more predictable and manageable weather patterns.
Predictable weather patterns could be beneficial to the country's water resources management and agriculture.

