In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# --- Load ---------------------------------------------------------------
# Read the raw table and keep everything from the third column onwards
# (the first two columns are dropped; the original note describes them as
# an index column and the county name).
df = pd.read_csv('DeforestationData.csv').iloc[:, 2:]

# --- Clean --------------------------------------------------------------
# These columns are read as text because of a formatting character
# (thousands separator or percent sign). Strip that character, then cast
# the column to float.
symbol_to_strip = {
    'Population Census 2009': ',',
    'Area (km2)': ',',
    'Forest Cover Percentage': '%',
    'Tree cover loss (2001-20) %': '%',
}
for column, symbol in symbol_to_strip.items():
    df[column] = df[column].str.replace(symbol, '').astype(float)

# --- Features / target --------------------------------------------------
# The last column is the regression target; every column before it is a feature.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# --- Train / test split -------------------------------------------------
# 80% of the rows are used for fitting, 20% are held out; the fixed seed
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# --- Fit and predict ----------------------------------------------------
# Ordinary least-squares fit on the training rows, then predictions for the
# held-out rows.
regressor = LinearRegression().fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# --- Collect and persist predictions ------------------------------------
# Copy of the held-out features with the model output added as a new column.
df_pred = X_test.assign(**{'Predicted Tree cover loss (2001-20) %': y_pred})

# index=False: the row labels are not written to the file.
df_pred.to_csv('PredictedDeforestationData.csv', index=False)

df_pred

Unnamed: 0,Area (km2),Population Census 2009,Urbanisation per County,Forest Cover Percentage,Predicted Tree cover loss (2001-20) %
28,2884.5,752965.0,14,16.8,9.530964
33,21292.7,687312.0,41,7.14,10.228441
30,8696.1,399227.0,25,6.79,10.846537
4,6497.7,101539.0,20,33.9,13.446076
18,2361.0,693558.0,25,38.03,12.753194
11,6930.1,1356301.0,12,18.31,6.568159
42,3154.7,963794.0,14,2.59,7.732985
31,7509.5,1603325.0,46,9.29,9.501245
22,71597.8,855399.0,14,4.06,-1.053428
10,25336.1,143294.0,44,5.34,11.985943


<h2> Calculating the accuracy (R² score) of the model on the test set </h2>

In [10]:
# Evaluate the fitted model on the held-out rows.
# For a regressor, `score` returns the coefficient of determination (R^2),
# not a classification accuracy: 1.0 is a perfect fit, 0.0 is no better than
# always predicting the mean of y_test, and the value can be negative.
# It is therefore reported as a plain number rather than as a percentage.
accuracy = regressor.score(X_test, y_test)  # variable name kept so later cells that read it still work
print('R^2 score on the test set: {:.4f}'.format(accuracy))

Acuracy of the prediction: 81.06%


In [3]:
# Show the current contents of `df` (rich DataFrame display as the cell's last expression).
df

Unnamed: 0,Area (km2),Population Census 2009,Urbanisation per County,Forest Cover Percentage,Tree cover loss (2001-20) %
0,212.5,939370.0,100,5.12,26.3
1,8270.3,649931.0,23,5.44,20.9
2,12245.9,1109735.0,26,7.67,15.6
3,35375.8,240075.0,15,6.21,16.9
4,6497.7,101539.0,20,33.9,8.6
5,17083.9,284657.0,15,3.63,2.9
6,45720.2,623060.0,24,7.09,0.6
7,55840.6,661941.0,15,1.94,0.2
8,25797.7,1025756.0,18,3.04,0.0
9,66923.1,291166.0,22,1.7,0.1


<h2><center> Sorting the predicted data according to priority</center> </h2>

In [18]:
# Order the rows by tree cover loss, largest loss first.
# Note: this sorts `df` on its 'Tree cover loss (2001-20) %' column, not the
# 'Predicted Tree cover loss (2001-20) %' column held in `df_pred`.
loss_column = 'Tree cover loss (2001-20) %'
df = df.sort_values(by=loss_column, ascending=False)

df

Unnamed: 0,Area (km2),Population Census 2009,Urbanisation per County,Forest Cover Percentage,Tree cover loss (2001-20) %
26,2955.3,894179.0,39,7.55,32.6
0,212.5,939370.0,100,5.12,26.3
31,7509.5,1603325.0,46,9.29,25.5
32,17921.2,850920.0,7,16.66,21.6
1,8270.3,649931.0,23,5.44,20.9
3,35375.8,240075.0,15,6.21,16.9
30,8696.1,399227.0,25,6.79,15.9
27,3049.7,369998.0,14,37.49,15.8
2,12245.9,1109735.0,26,7.67,15.6
40,2496.1,842304.0,11,0.42,13.2


<h2> The data above is sorted from the most vulnerable counties to the least vulnerable</h2>