# Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
import calendar

# Data Loading and Cleaning

In [2]:
# Names given, by position, to the seven fields of each parsed record
columns = [
    "yyyy",
    "mm",
    "tmax",
    "tmin",
    "af_days",
    "rain_mm",
    "sun_hours",
]

# Function to clean and parse a line of data
def clean_line(line):
    """Split one raw text line into its whitespace-separated fields.

    Every occurrence of the word "Provisional" is removed first. The
    fields are returned as a list of strings only when exactly seven
    remain; any other line yields None.
    """
    # str.split() with no argument already ignores leading/trailing
    # whitespace, so no explicit strip is required.
    fields = line.replace("Provisional", "").split()
    return fields if len(fields) == 7 else None

# Read the file and keep only the lines that clean_line accepts
# (it returns None for anything that is not a seven-field record).
with open('oxforddata.txt', 'r') as file:
    data = [fields for fields in map(clean_line, file) if fields]

# Convert the processed data into a DataFrame (all values are still strings)
df = pd.DataFrame(data, columns=columns)

# Strip the '*' marker from the measurement columns.
# regex=False forces a literal replacement: a bare '*' is not a valid regular
# expression, and the default value of `regex` differs between pandas versions.
for col in ['sun_hours', 'af_days', 'tmax', 'tmin', 'rain_mm']:
    df[col] = df[col].str.replace('*', '', regex=False)

# Discard the first parsed record.
# NOTE(review): presumably this is the file's column-header line, which also
# has seven tokens -- confirm against the data file.
df = df.drop(index=0)

# Convert every column to numbers; values that cannot be parsed become NaN.
# NOTE(review): any marker other than '*' left attached to a value will turn
# that value into NaN here -- confirm the file uses no other markers.
df = df.apply(pd.to_numeric, errors='coerce')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'oxforddata.txt'

In [None]:
# Show the last rows of the DataFrame
df.tail()

# EDA For Missing Values

In [None]:
# Print a summary of the DataFrame: column dtypes and non-null counts
df.info()

In [None]:
# Count the missing (NaN) values in each column
df.isnull().sum()

In [None]:
import seaborn as sns

# Scatter of sun_hours against month number, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='mm', y='sun_hours', ax=ax)
ax.set_title('Scatter Plot with Seaborn')
ax.set_xlabel('Months')
ax.set_ylabel('Sun Hours')
ax.grid(True)
plt.show()


In [None]:
# One box per month showing the distribution of sun_hours
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='mm', y='sun_hours', palette='Set2', ax=ax)
ax.set_title('Box Plot of Sun Hours by Months')
ax.set_xlabel('Months')
ax.set_ylabel('Sun Hours')
ax.grid(True)
plt.show()


In [None]:
# Fill missing values with a 5-nearest-neighbour imputer fitted on all columns.
# fit_transform returns a plain array, so the frame is rebuilt with the
# original column labels (the row index restarts from 0).
imputer = KNNImputer(n_neighbors=5)
df_imputed = imputer.fit_transform(df)
df = pd.DataFrame(df_imputed, columns=df.columns)

# Same per-month box plot, now drawn from the imputed data
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='mm', y='sun_hours', palette='Set2', ax=ax)
ax.set_title('Box Plot of Sun Hours by Months')
ax.set_xlabel('Months')
ax.set_ylabel('Sun Hours')
ax.grid(True)
plt.show()

In [None]:
# Re-count missing values per column (this cell follows the KNN imputation step)
df.isnull().sum()

## FEATURE ENGINEERING

In [None]:
# Derived temperature features: the tmax-tmin spread and the midpoint of the two
df['temp_range'] = df['tmax'].sub(df['tmin'])
df['avg_temp'] = df['tmax'].add(df['tmin']).div(2)

In [None]:
# Cast the year and month columns to integers in a single astype call
df = df.astype({"yyyy": int, "mm": int})

In [None]:
def assign_season(month):
    """Return the season code for a calendar month number.

    Codes: 1 = winter (12, 1, 2), 2 = spring (3, 4, 5),
    3 = summer (6, 7, 8), 4 = autumn (9, 10, 11).
    Any other value falls through and yields None.
    """
    season_table = (
        ((12, 1, 2), 1),    # Winter
        ((3, 4, 5), 2),     # Spring
        ((6, 7, 8), 3),     # Summer
        ((9, 10, 11), 4),   # Autumn
    )
    for months, code in season_table:
        if month in months:
            return code
    return None

# Label every row with the season code of its month
df['season'] = df['mm'].map(assign_season)

In [None]:
def calculate_days_in_month(row):
    """Return the number of days in the month described by *row*.

    *row* must support lookup by the keys 'yyyy' and 'mm'; both values are
    converted with int() before being passed to calendar.monthrange, so
    leap years are accounted for.
    """
    # monthrange returns (weekday of the 1st, number of days); keep the latter
    _, n_days = calendar.monthrange(int(row['yyyy']), int(row['mm']))
    return n_days

# Call calculate_days_in_month once per row (axis=1) and store the results as a new column
df['days_in_month'] = df.apply(calculate_days_in_month, axis=1)


In [None]:
# Temperature spread relative to tmin: (tmax - tmin) / tmin.
# pandas does not raise on division by zero; a tmin of exactly 0 would give
# +/-inf, which then corrupts means, plots and model inputs. Zero denominators
# are masked so those rows become NaN instead.
# The column name (including its spelling) is kept unchanged for compatibility.
# NOTE(review): a negative tmin flips the sign of this ratio -- confirm that
# is the intended meaning of the feature.
df['montly_temp_variation'] = (df['tmax'] - df['tmin']) / df['tmin'].replace(0, np.nan)

# af_days as a fraction of the number of days in the month
df['frost_day_props'] = df['af_days'] / df['days_in_month']

In [None]:
# Display the DataFrame
df

## Time Series

In [None]:
# Index the rows by date rather than by year alone.
# Each row is one (year, month) observation, so using only 'yyyy' as the index
# puts twelve rows on every index label and stacks a whole year on a single
# x position when plotting against df.index. Build a timestamp for the first
# day of each month from 'yyyy' and 'mm' instead.
month_start = pd.to_datetime({'year': df['yyyy'], 'month': df['mm'], 'day': 1})
df.set_index(pd.DatetimeIndex(month_start, name='date'), inplace=True)

# As with set_index('yyyy'), the year no longer appears among the columns;
# it remains available as df.index.year.
df.drop(columns='yyyy', inplace=True)

In [None]:
# Line plot of avg_temp against the DataFrame index, one marker per row
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df.index, df['avg_temp'], marker='o')
ax.set_title('Average Temperature Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Average Temperature')
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Additive seasonal decomposition of avg_temp.
# period=12 encodes the assumption of yearly seasonality.
result = seasonal_decompose(
    df['avg_temp'],
    model='additive',
    period=12,
)
result.plot()
plt.show()
