In [None]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Step 2: Upload the dataset through the Colab file picker and read it
from google.colab import files

uploaded = files.upload()

# Iterating the upload result yields file names; only the first one is used.
uploaded_names = iter(uploaded)
filename = next(uploaded_names)

df = pd.read_csv(filename)

# Last expression of the cell: show the first rows as the cell output.
df.head()


In [None]:
# Step 3: Data preprocessing
# Drop every row that has at least one missing value (kept simple on purpose).
# The explicit .copy() makes the result an independent frame, so the column
# assignments below cannot trigger a chained-assignment warning.
df = df.dropna().copy()

# Parse 'date' into datetimes; unparseable entries become NaT (errors='coerce').
# Plain column assignment is used instead of `df.loc[:, 'date'] = ...`:
# on pandas >= 2.0 the .loc form writes the values into the existing column
# in place, which can leave it as object dtype and make the `.dt` accessor
# on the next line raise AttributeError.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Calendar year of each parsed date (NaN where the date is NaT).
df['Year'] = df['date'].dt.year

print(df[['date', 'Year']].head())

In [None]:
# Step 4: Heatmap of the pairwise correlations between numeric columns
corr_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(13, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fits a linear regression of 'emission' on 'week_no' and 'Year', reports
# MSE / R-squared on a 20% hold-out split, and plots actual vs predicted.
#
# Any of the three columns that is absent (or entirely NaN) is replaced with
# simulated values. Those values come from one locally seeded generator so a
# re-run produces the same filler data without touching NumPy's global random
# state, and a notice is printed so simulated data is never used silently.
rng = np.random.default_rng(42)

# -- Step 1: Create or validate 'emission' column --
if 'emission' not in df.columns or df['emission'].isna().all():
    print("Note: no usable 'emission' column found; using simulated values.")
    df['emission'] = rng.uniform(50, 150, size=len(df))

# -- Step 2: Handle missing 'week_no' and 'Year' --
# Simulate only if the entire column is missing; otherwise fill the gaps
# from the values that are present.
if 'week_no' not in df.columns or df['week_no'].isna().all():
    print("Note: no usable 'week_no' column found; using simulated values.")
    df['week_no'] = rng.integers(1, 53, size=len(df))  # upper bound exclusive: weeks 1..52
else:
    df['week_no'] = df['week_no'].fillna(df['week_no'].median(numeric_only=True))

if 'Year' not in df.columns or df['Year'].isna().all():
    print("Note: no usable 'Year' column found; using simulated values.")
    df['Year'] = rng.choice([2019, 2020, 2021], size=len(df))
else:
    # mode() can return several values; take the first one as the fill value.
    df['Year'] = df['Year'].fillna(df['Year'].mode()[0])

# -- Step 3: Prepare features and target --
# Rows still missing any of the three columns are excluded from modelling.
df_model = df[['week_no', 'Year', 'emission']].dropna()
X = df_model[['week_no', 'Year']]
y = df_model['emission']

# -- Step 4: Train/test split --
if len(X) < 2:
    print("Not enough data to train the model.")
else:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # -- Step 5: Train model --
    model = LinearRegression()
    model.fit(X_train, y_train)

    # -- Step 6: Predict and evaluate --
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("Mean Squared Error:", mse)
    print("R-squared:", r2)

    # -- Step 7: Plot Actual vs Predicted --
    plt.figure(figsize=(8, 5))
    plt.scatter(y_test, y_pred, alpha=0.7, edgecolor='k')
    # Dashed diagonal marks perfect predictions; it spans the full target range.
    plt.plot([y.min(), y.max()], [y.min(), y.max()], color='red', linestyle='--')
    plt.xlabel("Actual Emissions")
    plt.ylabel("Predicted Emissions")
    plt.title("Actual vs Predicted Emissions")
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

# Step 1: Prompt for a CSV upload and read the first uploaded file
uploaded = files.upload()
filename = next(iter(uploaded))
df = pd.read_csv(filename)

# Steps 2-3: parse 'date' (invalid values become NaT), derive 'Year', then
# keep only the rows that have both a 'Year' and a 'likesCount'
df = (
    df.assign(date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'))
      .assign(Year=lambda frame: frame['date'].dt.year)
      .dropna(subset=['Year', 'likesCount'])
)

# Step 4: Line plot of likesCount against Year
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x='Year', y='likesCount', data=df, ax=ax)
ax.set(title="Likes Count Over Years", xlabel="Year", ylabel="Likes Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Ensure 'date' is in datetime format (unparseable values become NaT)
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Step 2: Extract the ISO calendar week number of each date
df['week'] = df['date'].dt.isocalendar().week

# Step 3: Plot count of records per week
plt.figure(figsize=(12, 6))
# seaborn 0.13 deprecates passing `palette` without `hue`; assigning the x
# variable to `hue` and disabling the legend is the replacement it recommends,
# and matches how the missing-values bar plot in this notebook is written.
sns.countplot(x='week', data=df, hue='week', palette='viridis', legend=False)
plt.title("Record Count per Week")
plt.xlabel("Week Number")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many records fall in each 'Year' (column name is capitalised)
fig, ax = plt.subplots(figsize=(4, 7))
sns.countplot(x='Year', data=df, ax=ax)
ax.set(title='Year Count Plot - Test', xlabel='Year', ylabel='Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:

# Step 1: Percentage of missing entries per column, restricted to columns
# that have at least one missing value, ordered from most to least incomplete
missing_pct = (
    df.isnull()
      .mean()
      .mul(100)
      .loc[lambda pct: pct > 0]
      .sort_values(ascending=False)
)

# Step 2: Horizontal bar chart, one bar per incomplete column
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=missing_pct.values,
    y=missing_pct.index,
    hue=missing_pct.values,
    palette="mako",
    legend=False,
    ax=ax,
)
ax.set_title("Missing Values Percentage by Feature (Train Set)")
ax.set_xlabel("Percentage Missing")
ax.set_ylabel("Features")
ax.grid(True, axis='x')
plt.tight_layout()
plt.show()


In [None]:
# Summarise missing data: total empty cells, which columns contain any,
# and how many rows have at least one gap.
null_mask = df.isnull()

total_missing = null_mask.sum().sum()
columns_with_missing = df.columns[null_mask.any()].tolist()
rows_with_missing = null_mask.any(axis=1).sum()

print(f"Total missing cells: {total_missing}")
print(f"Columns with missing values: {columns_with_missing}")
print(f"Rows with at least one missing value: {rows_with_missing}")


In [None]:

# Box plot showing the spread of the 'likesCount' column
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(y=df['likesCount'], color='skyblue', ax=ax)
ax.set_title("Boxplot of Likes Count")
ax.set_ylabel("Likes Count")
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Weekly emission line plots for two simulated locations, one subplot each.
# Expects 'df' to be loaded already and to contain a 'week' column.

# Simulate lat/lon and emission data for demo purposes.
# Note: lat and lon are drawn independently, so up to four lat/lon
# combinations can occur; only the two listed below are plotted.
np.random.seed(42)
df['lat'] = np.random.choice([-0.510, -0.528], size=len(df))
df['lon'] = np.random.choice([29.290, 29.472], size=len(df))

# Build the location label with fixed three-decimal formatting. A plain
# astype(str) drops trailing zeros (-0.510 -> '-0.51', 29.290 -> '29.29'),
# so the label '-0.510_29.290' below would never match and its subplot
# would stay empty.
df['location'] = df['lat'].map('{:.3f}'.format) + "_" + df['lon'].map('{:.3f}'.format)
df['emission'] = np.random.uniform(50, 150, size=len(df))  # Simulated emission values

# Sample filter for two locations
locations_to_plot = ['-0.510_29.290', '-0.528_29.472']

# Plot setup: two stacked panels sharing the week axis
sns.set_style('darkgrid')
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(13, 8), sharex=True)
colors = ['#999933', '#332288']

for ax, loc, color in zip(axes, locations_to_plot, colors):
    loc_data = df[df['location'] == loc]
    sns.lineplot(x='week', y='emission', data=loc_data, ax=ax, label=loc, color=color)
    ax.set_ylabel("Emission")
    ax.set_xlabel("Week Number")
    ax.legend()

plt.tight_layout()
plt.show()
# Check how many rows exist for each location
print(df['location'].value_counts())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Keep only the rows whose 'Year' is 2019, 2020 or 2021 (as an independent copy)
year_mask = df['Year'].isin([2019, 2020, 2021])
df_filtered = df.loc[year_mask].copy()

# Overwrite lat / lon / emission on the filtered copy with simulated values.
# The draw order (lat, lon, emission) after seeding is kept as-is.
np.random.seed(42)
n_rows = len(df_filtered)
df_filtered['lat'] = np.random.choice([-0.510, -0.528], size=n_rows)
df_filtered['lon'] = np.random.choice([29.290, 29.472], size=n_rows)
df_filtered['location'] = (
    df_filtered['lat'].astype(str) + "_" + df_filtered['lon'].astype(str)
)
df_filtered['emission'] = np.random.uniform(50, 150, size=n_rows)

# The two locations with the most rows are the ones that get plotted
location_counts = df_filtered['location'].value_counts()
top_locations = location_counts.index[:2].tolist()

# One panel per location, sharing the week axis, with one line per year
sns.set_style('darkgrid')
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(13, 8), sharex=True)
colors = ['#999933', '#332288']

for ax, loc, color in zip(axes, top_locations, colors):
    # `color` is not used for drawing: line colours come from the 'muted' palette.
    loc_data = df_filtered.loc[df_filtered['location'] == loc]
    sns.lineplot(x='week', y='emission', hue='Year', data=loc_data, ax=ax, palette='muted')
    ax.set(title=f"Location: {loc}", ylabel="Emission", xlabel="Week Number")
    ax.legend(title="Year")

plt.tight_layout()
plt.show()
