In [4]:
import pandas as pd

news_data = {
    'headline': [
        "Severe floods devastate villages in Western Nepal",
        "Government launches reforestation project in Central Nepal",
        "Drought conditions worsen affecting crops in Terai",
        "Researchers warn of accelerated glacial melt in Himalayas",
        "Community builds flood barriers to protect farmland"
    ]
}

news_df = pd.DataFrame(news_data)
news_df


Unnamed: 0,headline
0,Severe floods devastate villages in Western Nepal
1,Government launches reforestation project in C...
2,Drought conditions worsen affecting crops in T...
3,Researchers warn of accelerated glacial melt i...
4,Community builds flood barriers to protect far...


In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download stopwords if not already done
nltk.download('punkt')
nltk.download('stopwords')

# Check versions
import sklearn
import nltk
print("NLTK version:", nltk.__version__)
print("sklearn version:", sklearn.__version__)


NLTK version: 3.9.1
sklearn version: 1.6.1


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Sample corpus simulating climate change reports/news
data = {
    'Document_ID': [1, 2, 3],
    'Text': [
        "Heavy rainfall and flooding in Nepal have increased dramatically due to climate change.",
        "Glacial melt is accelerating, posing severe risks to agriculture and local communities.",
        "Recent policies aim to mitigate deforestation and reduce greenhouse gas emissions."
    ]
}

text_df = pd.DataFrame(data)
text_df


Unnamed: 0,Document_ID,Text
0,1,Heavy rainfall and flooding in Nepal have incr...
1,2,"Glacial melt is accelerating, posing severe ri..."
2,3,Recent policies aim to mitigate deforestation ...


Text Cleaning & Preprocessing

In [9]:
import nltk
nltk.download('punkt')

# Manually load the tokenizer once to avoid further lookup errors
from nltk.tokenize import PunktSentenceTokenizer
tokenizer = PunktSentenceTokenizer()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
from nltk.tokenize import TreebankWordTokenizer
treebank_tokenizer = TreebankWordTokenizer()

text_df['Tokens'] = text_df['Clean_Text'].apply(lambda x: treebank_tokenizer.tokenize(x))


In [12]:
import pandas as pd
import re
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Read your text data (assuming your CSV has columns "Date" and "Text")
text_df = pd.read_csv('text_data.csv')

# Clean text function
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', str(text).lower())
    return text

# Apply cleaning
text_df['Clean_Text'] = text_df['Text'].apply(clean_text)

# Tokenization using TreebankWordTokenizer (avoids punkt errors)
tokenizer = TreebankWordTokenizer()
text_df['Tokens'] = text_df['Clean_Text'].apply(lambda x: tokenizer.tokenize(x))

# Remove stopwords
stop_words = set(stopwords.words('english'))
text_df['Tokens_NoStop'] = text_df['Tokens'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

# Lemmatization
lemmatizer = WordNetLemmatizer()
text_df['Lemmas'] = text_df['Tokens_NoStop'].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])

# Preview the result
text_df.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Date,Text,Clean_Text,Tokens,Tokens_NoStop,Lemmas
0,1/1/2001,Forest loss may increase due to excessive rain...,forest loss may increase due to excessive rain...,"[forest, loss, may, increase, due, to, excessi...","[forest, loss, may, increase, due, excessive, ...","[forest, loss, may, increase, due, excessive, ..."
1,5/15/2005,Deforestation is slowing down in the western r...,deforestation is slowing down in the western r...,"[deforestation, is, slowing, down, in, the, we...","[deforestation, slowing, western, region]","[deforestation, slowing, western, region]"
2,7/20/2010,Community forestry efforts have reduced tree c...,community forestry efforts have reduced tree c...,"[community, forestry, efforts, have, reduced, ...","[community, forestry, efforts, reduced, tree, ...","[community, forestry, effort, reduced, tree, c..."
3,9/10/2015,Increased awareness campaigns are helping to p...,increased awareness campaigns are helping to p...,"[increased, awareness, campaigns, are, helping...","[increased, awareness, campaigns, helping, pre...","[increased, awareness, campaign, helping, prev..."
4,12/5/2020,Recent data shows fluctuations in precipitatio...,recent data shows fluctuations in precipitatio...,"[recent, data, shows, fluctuations, in, precip...","[recent, data, shows, fluctuations, precipitat...","[recent, data, show, fluctuation, precipitatio..."


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Join tokens back to text (since TF-IDF expects strings)
text_df['Processed_Text'] = text_df['Clean_Text']

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=100)  # Limit to top 100 terms to avoid too many features

# Fit and transform
tfidf_matrix = tfidf.fit_transform(text_df['Processed_Text'])

# Convert to DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=[f'TFIDF_{word}' for word in tfidf.get_feature_names_out()])

# Combine with the original dataframe (keeping Date)
final_text_features = pd.concat([text_df[['Date']], tfidf_df], axis=1)

# Preview
final_text_features.head()


Unnamed: 0,Date,TFIDF_and,TFIDF_are,TFIDF_awareness,TFIDF_campaigns,TFIDF_community,TFIDF_cover,TFIDF_data,TFIDF_deforestation,TFIDF_down,...,TFIDF_rainfall,TFIDF_recent,TFIDF_reduced,TFIDF_region,TFIDF_shows,TFIDF_slowing,TFIDF_the,TFIDF_to,TFIDF_tree,TFIDF_western
0,1/1/2001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.388684,0.0,0.0,0.0,0.0,0.0,0.0,0.313587,0.0,0.0
1,5/15/2005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.361529,0.361529,...,0.0,0.0,0.0,0.361529,0.0,0.361529,0.361529,0.0,0.0,0.361529
2,7/20/2010,0.0,0.0,0.0,0.0,0.378823,0.378823,0.0,0.0,0.0,...,0.0,0.0,0.378823,0.0,0.0,0.0,0.0,0.0,0.305632,0.0
3,9/10/2015,0.0,0.36228,0.36228,0.36228,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.292285,0.0,0.0
4,12/5/2020,0.36228,0.0,0.0,0.0,0.0,0.0,0.36228,0.0,0.0,...,0.0,0.36228,0.0,0.0,0.36228,0.0,0.0,0.0,0.292285,0.0


In [15]:
from textblob import TextBlob

# Function to calculate polarity
def get_polarity(text):
    return TextBlob(text).sentiment.polarity

# Apply polarity score
text_df['Sentiment_Polarity'] = text_df['Clean_Text'].apply(get_polarity)

# Preview
text_df[['Date', 'Sentiment_Polarity']].head()


Unnamed: 0,Date,Sentiment_Polarity
0,1/1/2001,-0.1875
1,5/15/2005,-0.077778
2,7/20/2010,0.0
3,9/10/2015,0.0
4,12/5/2020,0.0


In [16]:
# Extract year from Date
text_df['Year'] = pd.to_datetime(text_df['Date']).dt.year

# Group by Year and calculate average sentiment
annual_sentiment = text_df.groupby('Year')['Sentiment_Polarity'].mean().reset_index()

# Preview
annual_sentiment.head()


Unnamed: 0,Year,Sentiment_Polarity
0,2001,-0.1875
1,2005,-0.077778
2,2010,0.0
3,2015,0.0
4,2020,0.0


In [23]:
features_df = pd.read_csv('final_features.csv')
print(features_df.columns)


Index(['Year', 'scaled_total_tc_loss', 'scaled_tc_loss_lag1',
       'scaled_Precipitation'],
      dtype='object')


In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# ----------------------------------------
# STEP 1: Load the final features and sentiment score
# ----------------------------------------

# Load feature dataset
features_df = pd.read_csv('final_features.csv')

# Load sentiment scores
annual_sentiment = pd.read_csv('annual_sentiment.csv')

# Merge sentiment into features_df
features_df = pd.merge(features_df, annual_sentiment, on='Year', how='left')

print("Columns after merge:")
print(features_df.columns)

# ----------------------------------------
# STEP 2: Prepare Features (X) and Target (y)
# ----------------------------------------

# Define feature columns and target column
X = features_df[['scaled_tc_loss_lag1', 'scaled_Precipitation', 'Sentiment_Score']]
y = features_df['scaled_total_tc_loss']

# ----------------------------------------
# STEP 3: Split the data into Train and Test
# ----------------------------------------

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ----------------------------------------
# STEP 4: Train Random Forest Regressor
# ----------------------------------------

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# ----------------------------------------
# STEP 5: Make Predictions
# ----------------------------------------

y_pred = rf_model.predict(X_test)

# ----------------------------------------
# STEP 6: Calculate Evaluation Metrics
# ----------------------------------------

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"\n--- Model Performance ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R²): {r2:.4f}")


Columns after merge:
Index(['Year', 'scaled_total_tc_loss', 'scaled_tc_loss_lag1',
       'scaled_Precipitation', 'Sentiment_Score'],
      dtype='object')

--- Model Performance ---
Mean Absolute Error (MAE): 0.9608
Root Mean Squared Error (RMSE): 1.0265
R-squared (R²): 0.4677


In [32]:
# Load sentiment scores
annual_sentiment = pd.read_csv('annual_sentiment.csv')

# Check to confirm it's loaded correctly
print(annual_sentiment)

# Merge sentiment into features_df
features_df = pd.merge(features_df, annual_sentiment, on='Year', how='left')

# Confirm the merge
print("Columns after merge:")
print(features_df.columns)


   Year  Sentiment_Score
0  2001        -0.187500
1  2005        -0.077778
2  2010         0.000000
3  2015         0.000000
4  2020         0.000000
Columns after merge:
Index(['Year', 'scaled_total_tc_loss', 'scaled_tc_loss_lag1',
       'scaled_Precipitation', 'Sentiment_Score_x', 'Sentiment_Score_y'],
      dtype='object')


In [36]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the correct dataset that contains Sentiment_Score
features_df = pd.read_csv('final_features.csv')

# Check columns to confirm
print(features_df.columns)



Index(['Year', 'scaled_total_tc_loss', 'scaled_tc_loss_lag1',
       'scaled_Precipitation'],
      dtype='object')


In [37]:
numeric_features = ['total_tc_loss', 'tc_loss_lag1', 'Precipitation', 'Sentiment_Score']

scaler = StandardScaler()
scaled_values = scaler.fit_transform(features_df[numeric_features])

scaled_df = pd.DataFrame(scaled_values, columns=[f'scaled_{col}' for col in numeric_features])
scaled_df['Year'] = features_df['Year']

print(scaled_df.head())


KeyError: "None of [Index(['total_tc_loss', 'tc_loss_lag1', 'Precipitation', 'Sentiment_Score'], dtype='object')] are in the [columns]"

In [38]:
print(features_df.columns)


Index(['Year', 'scaled_total_tc_loss', 'scaled_tc_loss_lag1',
       'scaled_Precipitation'],
      dtype='object')


In [39]:
from sklearn.preprocessing import StandardScaler

# Scale Sentiment_Score separately
scaler_sent = StandardScaler()
features_df['scaled_Sentiment_Score'] = scaler_sent.fit_transform(features_df[['Sentiment_Score']])

# Check the final dataframe
print(features_df.columns)
features_df.head()


KeyError: "None of [Index(['Sentiment_Score'], dtype='object')] are in the [columns]"