<a href="https://colab.research.google.com/github/Wachiputi/ACPP/blob/model-ACPP/CommodityPriceProjection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**IMPORTING THE FRAMEWORKS AND TOOLS**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import math
%matplotlib inline

In [None]:
import numpy.random as rnd
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

In [None]:
#this line of code is importing three classes from the sklearn (scikit-learn) library in Python. These classes are used for creating custom machine learning models and transformers.
from sklearn.base import BaseEstimator,TransformerMixin,RegressorMixin

In [None]:
# Seed NumPy's global random number generator (numpy.random, imported as `rnd`) with 42
# so that any code drawing from it produces the same sequence on every run.
# NOTE: this only affects numpy.random's global state; the StratifiedShuffleSplit used
# later in the notebook is made reproducible through its own random_state argument.

rnd.seed(42)

##**LOADING THE DATA**

In [None]:
# Load the raw price table (presumably WFP food prices for Malawi, judging by the file name).
# NOTE(review): '/content/...' is a Google Colab path — the CSV has to be uploaded there
# (or the path adjusted) before this cell can run.
data1 = pd.read_csv('/content/wfp_food_prices_mwi.csv')
# Preview the first five rows.
data1.head(5)

In [None]:
# Drop the first row of the raw table and keep the rest as a new frame `data`.
# NOTE(review): presumably row 0 is a secondary header/tag row rather than a real
# observation — confirm against the CSV.
data = data1.drop(data1.index[0])
# Preview the first five remaining rows.
data.head(5)

In [None]:
# Remove every row that has at least one missing value (modifies `data` in place).
data.dropna(inplace=True)

##converting the categorical values to float and data to datetime

In [None]:
# Parse the 'date' strings into pandas datetimes.
data['date'] = pd.to_datetime(data['date'])

# Cast the numeric columns to numbers, in the order:
# price, longitude, latitude, usdprice.
for numeric_col in ('price', 'longitude', 'latitude', 'usdprice'):
    data[numeric_col] = pd.to_numeric(data[numeric_col])

# Express the coordinates in radians instead of degrees.
# Every later cell therefore sees longitude/latitude in radians.
for angle_col in ('longitude', 'latitude'):
    data[angle_col] = data[angle_col].apply(math.radians)

In [None]:
# Summarise column dtypes and non-null counts of `data`.
data.info()

## The summary shows that several columns are categorical (object dtype). One of them is `commodity`; let's see what values it contains.

In [None]:
# Count how many rows there are for each distinct commodity.
data['commodity'].value_counts()

In [None]:
# Generate descriptive statistics of `data` using pandas' describe() with its default settings.
data.describe()

In [None]:
# Draw one histogram per column that DataFrame.hist() can chart.
# bins=50 splits each column's value range into 50 equal-width bins;
# figsize=(15,10) makes the overall figure 15 inches wide and 10 inches tall.
data.hist(bins=50,figsize=(15,10))

#**GETTING THE DATASET**



*  Used Stratified sampling technique
*  Defined a new feature `price_cat`, a price category, and used it for sampling




In [None]:
# Bucket prices into a coarse category used only for stratified sampling:
# ceil(price / 1.5), with every bucket of 5 or more merged into bucket 5.0.
data['price_cat'] = np.ceil(data['price'] / 1.5)
# Assign the capped result back explicitly. The previous
# `data['price_cat'].where(..., inplace=True)` mutated a temporary column selection,
# which pandas warns about and which no longer updates `data` under Copy-on-Write.
data['price_cat'] = data['price_cat'].where(data['price_cat'] < 5, 5.0)

In [None]:
# Convert the 'price_cat' column to a categorical data type
data['price_cat'] = pd.Categorical(data['price_cat'])

# Identify classes with low frequencies (fewer than 2 rows).
# NOTE(review): presumably because a stratified split needs at least 2 members per class — confirm.
class_counts = data['price_cat'].value_counts()
low_frequency_classes = class_counts[class_counts < 2].index.tolist()

# Combine low frequency classes into a single class labelled 'Other'
data['price_cat'] = data['price_cat'].replace(low_frequency_classes, 'Other')

# Check the updated distribution of classes
print(data['price_cat'].value_counts())


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

The following cells create a stratified split of the dataset into training and testing sets, ensuring that the distribution of categories in the `price_cat` column is similar in both sets. This is important for training and evaluating machine learning models so that they generalize well to unseen data.

In [None]:
# Sanity checks on `data` before the stratified split.
# The train/test index arrays are NOT printed here: they are only created by the
# StratifiedShuffleSplit cell further down, so referencing them at this point raises
# NameError on a fresh kernel (Restart & Run All).

# Check if the DataFrame contains any data
print("Data empty?", data.empty)

# Verify column names
print("Column names in data:", data.columns)
print("Is 'price_cat' in columns?", 'price_cat' in data.columns)

# Check for duplicate index labels
print("Duplicates in data:", data.index.duplicated().any())


In [None]:
# Replace the index with a fresh 0..n-1 range and discard the old labels (in place),
# so that row labels line up with row positions again.
data.reset_index(drop=True, inplace=True)

In [None]:
# Split the dataset into training and testing sets while keeping the proportion of
# each 'price_cat' class similar in both sets.

# Turn 'price_cat' back into numbers; anything non-numeric (e.g. the 'Other' class)
# becomes NaN because of errors='coerce'.
data['price_cat'] = pd.to_numeric(data['price_cat'], errors='coerce')

# Drop rows with NaN values in the 'price_cat' column
data.dropna(subset=['price_cat'], inplace=True)

# Perform the stratified shuffle split (a single 80/20 split with a fixed seed).
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data['price_cat']):
    # split() yields POSITIONAL row indices, so select with .iloc. The previous .loc
    # lookup treated them as index labels, which breaks (KeyError or wrong rows) as
    # soon as the dropna above leaves gaps in the index.
    # .copy() makes both sets independent frames that are safe to modify in place.
    strat_train_set = data.iloc[train_index].copy()
    strat_test_set = data.iloc[test_index].copy()

In [None]:
# 'price_cat' was only needed to stratify the split, so strip it from both sets;
# otherwise a model could pick up spurious patterns from this helper column during training.
strat_train_set.drop(columns=['price_cat'], inplace=True)
strat_test_set.drop(columns=['price_cat'], inplace=True)

In [None]:
# Persist both sets as CSV files in the current working directory so they can be
# reloaded later for analysis, visualization, or modeling.
strat_train_set.to_csv("strat_train_set.csv",index=False)
strat_test_set.to_csv("strat_test_set.csv",index=False)

# index=False keeps the DataFrame index out of the files, so no extra index
# column appears when they are read back.

#**Exploratory Data Analysis**

In [None]:
# From here on `data` refers to the TRAINING set only, reloaded from the CSV written earlier.
# No parse_dates is passed, so 'date' comes back as plain strings rather than datetimes.
data=pd.read_csv('strat_train_set.csv')
# No 'Unnamed: 0' column needs dropping: the file was written with index=False.

In [None]:
# Column dtypes and non-null counts of the reloaded training set.
data.info()

In [None]:
# Scatter of the longitude/latitude pairs of every row in the training set.
plt.figure(figsize=(10,6))
plt.scatter(x=data['longitude'],y=data['latitude'])
# Label the axes so the figure stands on its own; the coordinates were converted
# from degrees to radians before the training set was saved.
plt.xlabel("Longitude (radians)")
plt.ylabel("Latitude (radians)")
plt.title("Distribution of commodities",size=16)

##By setting alpha=0.1, we can see high density areas. When alpha=0.1, the plotted elements (such as points or areas) are mostly transparent, making it easier to distinguish regions with high density because they will appear darker due to overlapping points.

In [None]:
# Same scatter as before, but with mostly transparent markers (alpha=0.1) so that
# locations with many overlapping rows show up darker.
plt.figure(figsize=(10,6))
plt.scatter(x=data['longitude'],y=data['latitude'],alpha=0.1)
# Label the axes so the figure stands on its own; the coordinates were converted
# from degrees to radians before the training set was saved.
plt.xlabel("Longitude (radians)")
plt.ylabel("Latitude (radians)")
plt.title("Distribution of commodities",size=16)

This plot visualizes the geographical distribution of commodities, with each point representing a location and its size indicating the quantity of the commodity at that location.the legend provides a key for interpreting the colors used in the plot.

The level of transparency (alpha) of the data points affects the visibility of overlapping points. Higher transparency allows you to see through overlapping points and observe patterns more clearly, while lower transparency may obscure details in densely populated areas.

This code snippet generates a scatter plot visualizing the distribution of prices across geographical locations, with each point representing a location and its size indicating the price at that location. The color of each point also represents the price value, with the color bar providing a reference for interpreting the price range.

In [None]:
plt.figure(figsize=(12,12))

# Marker size AND colour both encode the price. `c=` is required: without it
# matplotlib ignores `cmap`, the scatter carries no colour data, and the
# plt.colorbar() call below has no mappable to build the colour bar from.
plt.scatter(x=data['longitude'],y=data['latitude'],alpha=0.5,s=data['price']/30,
            c=data['price'],cmap=plt.get_cmap("jet"),zorder=1,label='price')
plt.colorbar(label='price')
# The coordinates were converted from degrees to radians before the training set was saved.
plt.xlabel("Longitude (radians)")
plt.ylabel("Latitude (radians)")
plt.title("Distribution of prices",size=16)
plt.legend()

From above plot we can infer that,


*   commodity prices are much related to location





##**SELECTION OF FEATURES**

In [None]:
selected_features = ['date', 'commodity', 'price', 'longitude', 'latitude', 'market']

# `df` is not defined by any earlier cell, so `df[selected_features]` used to raise
# NameError on a fresh kernel. Bind it to the training set loaded from the CSV.
# NOTE(review): assumes `df` was meant to be the training frame `data` — confirm.
df = data.copy()
# The CSV round trip stores dates as text; parse them so the range filter below
# compares real dates instead of strings.
df['date'] = pd.to_datetime(df['date'])

# Filter the data based on selected features
filtered_data = df[selected_features]

# Keep only rows dated 2023-07-01 through 2024-02-29 (both ends inclusive).
# The previous upper bound '2024-02-31' is not a real calendar date.
date_mask = filtered_data['date'].between(pd.Timestamp('2023-07-01'), pd.Timestamp('2024-02-29'))
filtered_data = filtered_data[date_mask]

# Commodities to keep (edit this list to analyse other commodities)
specific_commodities = ['Maize (new harvest)', 'Beans','Cowpeas']

# Filter the data to include only the specific commodities
filtered_data = filtered_data[filtered_data['commodity'].isin(specific_commodities)]

# Display the filtered data
filtered_data

In [None]:
# List the column labels of `df`.
df.columns

In [None]:
# Keep these 14 columns of `df`, in this order, and rebind `df` to the result.
df = df[['date', 'admin1', 'admin2', 'market', 'latitude', 'longitude',
       'category', 'commodity', 'unit', 'priceflag', 'pricetype', 'currency',
       'price', 'usdprice']]
# Preview the first rows.
df.head()

In [None]:
# Number of missing values in each column of `df`.
df.isnull().sum()

In [None]:
# Count the rows of `df` that are flagged as duplicates by DataFrame.duplicated()

df.duplicated().sum()

In [None]:
# Column dtypes and non-null counts of `df`.
df.info()

In [None]:
# Column labels split by dtype: object columns vs. 32/64-bit integer and float columns.
catvars = df.select_dtypes(include=['object']).columns
numvars = df.select_dtypes(include = ['int32','int64','float32','float64']).columns

# Show both label sets as the cell output.
catvars,numvars

In [None]:
def uniquevals(col, frame=None):
    """Print the distinct values found in one column.

    Args:
        col: Label of the column to inspect.
        frame: DataFrame to read the column from. When omitted, the
            notebook-level ``df`` is used, which is the original behaviour.
    """
    # Resolve the frame at call time so existing `uniquevals(col)` calls keep working.
    source = df if frame is None else frame
    print(f'Details of the particular col {col} is : {source[col].unique()}')

def valuecounts(col, frame=None):
    """Print how often each distinct value occurs in one column.

    Args:
        col: Label of the column to inspect.
        frame: DataFrame to read the column from. When omitted, the
            notebook-level ``df`` is used, which is the original behaviour.
    """
    # Resolve the frame at call time so existing `valuecounts(col)` calls keep working.
    source = df if frame is None else frame
    print(f'Valuecounts of the particular col {col} is : {source[col].value_counts()}')




# Print the distinct values of every column of `df`, each followed by a 75-dash separator line.
for col in df.columns:
    uniquevals(col)
    print("-"*75)

# Exploratory Data Analysis

In [None]:
# View the distribution of the 'price' column of the filtered rows (drawn in red)

sn.displot(filtered_data['price'],color='red')