# Topic: Climate-Resilient Farming Practice

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Import Agricultural Survey data
# low_memory=False makes pandas read the file in one pass instead of in chunks
raw_data = pd.read_csv('./Dataset/data.csv', low_memory=False)
# Call head() so the first rows are displayed rather than the bound-method repr
raw_data.head()

In [3]:
# Dump every column label of raw_data to columns.txt, one label per line,
# so the columns needed for each section can be picked out by hand
with open('columns.txt', 'w') as file:
    file.writelines('%s\n' % item for item in raw_data.columns)

# Section information

Section 1: Household Roster--Members of Households and Education
Section 2: Employment
Section 3: Tenure Issues and Labor Composition
Section 4: Details on Farming Activities
Section 5: Access and Extension Services 
Section 6: Other Farming Costs and Farm Subsidies
Section 7: Adaptation Options

In [4]:
# Text files that list, one per line, the column names of each section of interest
sections = ['section_3_col.txt', 'section_4_col.txt', 'section_7_col.txt']


def _read_column_names(path):
    """Return the lines of the text file at `path`, stripped of surrounding whitespace."""
    with open(path, 'r') as handle:
        return [line.strip() for line in handle]


# Key each list by its file name minus the 4-character '.txt' suffix
sec_cols = {file_name[:-4]: _read_column_names(file_name) for file_name in sections}
# Show the columns collected for every section
print(sec_cols)

In [5]:
# Bind each section's column list to an explicitly named variable.
# Explicit assignment replaces writing into locals(): the Python documentation
# states that the mapping returned by locals() should not be modified, and the
# dynamic version hid these three names from readers and static analysis.
section_3_col = sec_cols['section_3_col']
section_4_col = sec_cols['section_4_col']
section_7_col = sec_cols['section_7_col']

## Section 3 - Tenure issues

In [6]:
# Filter raw data to contain columns needed for section 3
tenure_data = raw_data[section_3_col]
# Print tenure_data
# Lift the column display limit first so wide frames are not elided.
# Note: set_option changes the setting for the rest of the session, not only this cell.
pd.set_option('display.max_columns', None)
print(tenure_data)

In [7]:
# Share of missing values per column, expressed as a percentage (0-100)
na_percentage = tenure_data.isna().mean() * 100

# Discard every column in which more than 80% of the values are missing
columns_to_drop = na_percentage.index[(na_percentage > 80).to_numpy()]
tenure_data = tenure_data.drop(columns=columns_to_drop)

In [8]:
# Calculate sum of missing data in each column
# (the result is the cell's displayed output: one NaN count per remaining column)
tenure_data.isna().sum()

In [9]:
# Confirm all plot areas are in the same unit:
# describe() summarises the values recorded in the 'plotunits' column
tenure_data['plotunits'].describe()

In [10]:
# List the column labels currently present in tenure_data
tenure_data.columns

In [11]:
# Create an empty dataframe to contain selected tenure features
# NOTE(review): tenure_features is rebound in the following cell, so this placeholder looks redundant
tenure_features = pd.DataFrame()

In [12]:
# Create a list of streamlined columns
columns = [
    'hhcode', 'adm0', 'adm1', 'farmtype', 'fplotarea1', 'fsystem1',
    'tenure1', 'yearsuse1', 'rentplot1', 'season1s', 'season1e',
    'season2s', 'season2e', 'seas1nam', 'seas2nam',
]
# Filter the tenure_data to store streamlined columns in tenure_features.
# Take an explicit copy: later cells assign into tenure_features, and assigning
# into a selection taken from another frame triggers SettingWithCopyWarning.
tenure_features = tenure_data[columns].copy()

In [13]:
# Print the frequency table of every selected column (NaN counted as a value)
# so that unexpected entries stand out
cat_cols = tenure_features.columns
for col in cat_cols:
    frequency_table = tenure_features[col].value_counts(dropna=False)
    print(frequency_table)

In [14]:
# Lower-case the adm1 values so differently-cased spellings collapse into one value
tenure_features['adm1'] = tenure_features['adm1'].str.lower()
# With the row display limit lifted, list the adm1 counts within each adm0 (NaN included)
with pd.option_context('display.max_rows', None):
    print(tenure_features.groupby('adm0')['adm1'].value_counts(dropna=False).sort_index())

In [15]:
# Replace the misspelt value 'bborng-ahafo' with 'brong-ahafo'
tenure_features['adm1'] = tenure_features['adm1'].replace('bborng-ahafo','brong-ahafo')

In [16]:
def _keep_or_fallback(value, first, last, fallback):
    """Return `value` when it equals an integer in [first, last] or is missing, else `fallback`."""
    if value in range(first, last + 1) or pd.isna(value):
        return value
    return fallback


# fsystem1: anything outside the categorical codes 1-6 becomes 5 ('Other')
tenure_features.loc[:, 'fsystem1'] = tenure_features['fsystem1'].apply(
    _keep_or_fallback, args=(1, 6, 5)
)

# tenure1: anything outside the categorical codes 1-7 becomes 7 ('Other')
tenure_features.loc[:, 'tenure1'] = tenure_features['tenure1'].apply(
    _keep_or_fallback, args=(1, 7, 7)
)

In [17]:
# Spelling variants are folded into the plural form; every other listed entry
# is grouped under 'others'
replacements = {
    'long rain': 'long rains',
    'short rain': 'short rains',
    '0': 'others',
    '3': 'others',
    '3meher': 'others',
    '3 meher': 'others',
    'beley': 'others',
    'belg': 'others',
    '3-meher': 'others',
    '3,1beley': 'others',
    'oct': 'others',
    '4': 'others',
    'belg3,1': 'others',
    '2-belg': 'others',
    '1belge': 'others',
    '99': 'others',
    'belg(1)': 'others',
    '3belg': 'others',
    '2- belg': 'others',
    '1,2': 'others',
    '999': 'others',
}

# Normalise both season-name columns: lower-case first, then apply the mapping above
for season_name_col in ('seas1nam', 'seas2nam'):
    lowered = tenure_features[season_name_col].str.lower()
    tenure_features[season_name_col] = lowered.replace(replacements)

## Section 4 - Details on Farming Activities

In [18]:
# Filter raw data to contain columns needed for section 4
crop_data = raw_data[section_4_col]

# Print crop_data
print(crop_data)

In [19]:
# Calculate sum of missing data in each column
# (the row display limit is lifted inside the context so no column is elided from the listing)
with pd.option_context('display.max_rows', None):
    print(crop_data.isna().sum())

In [20]:
# Use regex to drop columns not needed
import re

# A column is dropped when its name contains any of these tokens
regex_pattern = r's3|p2|c2|c3|c4|c5|c6'
unwanted_name = re.compile(regex_pattern)

# Collect the matching column names, remove them from crop_data, then show what was removed
plot_cols = list(filter(unwanted_name.search, crop_data.columns))
crop_data = crop_data.drop(columns=plot_cols)
print(plot_cols)

In [21]:
# Share of missing values per column, expressed as a percentage (0-100)
na_percentage = crop_data.isna().mean() * 100

# Discard every column in which more than 80% of the values are missing
columns_to_drop = na_percentage.index[(na_percentage > 80).to_numpy()]
crop_data = crop_data.drop(columns=columns_to_drop)

print(crop_data)

In [22]:
# List the column labels still present in crop_data after the drops above
print(crop_data.columns)

In [23]:
# Create crop_features as an independent copy of the cleaned section 4 data.
# The original bound crop_features to the very same object as crop_data (after
# first creating an unused empty frame), so the in-place recoding done on
# crop_features also silently rewrote crop_data.
crop_features = crop_data.copy()

In [24]:
# Print the frequency table of every crop_features column (NaN counted as a value)
# so that unexpected entries stand out
cat_cols = crop_features.columns
for col in cat_cols:
    frequency_table = crop_features[col].value_counts(dropna=False)
    print(frequency_table)

In [25]:
def _recode_out_of_range(frame, cols, last_code):
    """Overwrite, in `frame`, values of `cols` that fall outside the codes 1..last_code.

    A value is kept when it equals an integer in [1, last_code] or is missing
    (NaN); every other value is replaced by `last_code`, which this notebook
    uses as the 'Other' category of the column.
    """
    valid_codes = range(1, last_code + 1)
    for col in cols:
        frame.loc[:, col] = frame[col].apply(
            lambda x: x if (x in valid_codes or pd.isna(x)) else last_code
        )


# Each group below is recoded with ITS OWN column list. The original loops all
# iterated cat_crop_cols, so the crop codes were clamped again to 7, 5 and 4
# while the market, water and irrigation columns were never cleaned.

# Crop columns: convert odd values not within categorical range to 'Other', 56
cat_crop_cols = ['s1p1c1', 's2p1c1', 'pc1']
_recode_out_of_range(crop_features, cat_crop_cols, 56)

# Market columns: convert odd values not within categorical range to 'Other', 7
# NOTE(review): the label dictionaries applied to the market columns later in this
# notebook only list codes 0-4 - confirm against the questionnaire that 7 is the
# intended 'Other' code here.
cat_mkt_col = ['s1p1c1mkt', 's2p1c1mkt']
_recode_out_of_range(crop_features, cat_mkt_col, 7)

# Transport: convert odd values not within categorical range to 'Other', 6
crop_features.loc[:, 'transport'] = crop_features['transport'].apply(lambda x: x if (x in range(1, 7) or pd.isna(x)) else 6)

# Water-source columns: convert odd values not within categorical range to 'Other', 5
cat_wat_col = ['s1p1wat1', 's1p1wat2', 's1p1wat3', 's1p1wat4', 's1p1wat5']
_recode_out_of_range(crop_features, cat_wat_col, 5)

# Irrigation columns: convert odd values not within categorical range to 'Other', 4
cat_irrig_col = ['s1p1irrig1', 's1p1irrig2', 's1p1irrig3', 's1p1irrig4']
_recode_out_of_range(crop_features, cat_irrig_col, 4)

## Section 7 - Climate Adaptation

In [26]:
# Filter raw data to contain the columns listed in section_7_col
climate_data = raw_data[section_7_col]
# Print climate_data
print(climate_data)

In [27]:
# Share of missing values per column, expressed as a percentage (0-100)
na_percentage = climate_data.isna().mean() * 100

# Discard every column in which more than 80% of the values are missing
columns_to_drop = na_percentage.index[(na_percentage > 80).to_numpy()]
climate_data = climate_data.drop(columns=columns_to_drop)

print(climate_data)

In [28]:
# Calculate sum of missing data in each column
# (the row display limit is lifted inside the context so no column is elided from the listing)
with pd.option_context('display.max_rows', None):
    print(climate_data.isna().sum())

In [29]:
# Get value_counts of the binary columns to spot odd values not 1/0/1.0/0.0
# (the first two columns are skipped; NaN is counted as its own value)
for col in climate_data.columns[2:]:
    print(climate_data[col].value_counts(dropna=False))

In [30]:
# Every column from the third onward is passed through binary_convert below (the first two are skipped)
binary_columns = climate_data.columns[2:]

def binary_convert(column):
    """Coerce a single cell value to the integer 0 or 1, or NaN for anything else.

    Parameters
    ----------
    column : object
        One cell value (despite the name, not a whole column): a number, a
        numeric string, or any other object.

    Returns
    -------
    int or float
        0 or 1 when the value is numerically exactly 0 or 1; ``np.nan`` when it
        is missing, non-numeric, infinite, or any other number.

    Notes
    -----
    The value is compared as a float instead of being truncated with ``int()``,
    so fractional entries such as 0.5 or 1.9 are reported as NaN rather than
    being silently turned into 0 or 1. Infinite and oversized values are also
    mapped to NaN instead of raising ``OverflowError``.
    """
    try:
        number = float(column)
    except (ValueError, TypeError, OverflowError):
        return np.nan
    if number in (0.0, 1.0):
        return int(number)
    return np.nan

# Run binary_convert over every value of each column listed in binary_columns
for column in binary_columns:
    # Convert non-binary values to NaN
    climate_data[column] = climate_data[column].apply(binary_convert)

# All columns have binary values in the format 1 and 0, handling odd digits
print(climate_data)

In [31]:
# Create an empty dataframe to contain selected climate features
# NOTE(review): climate_features is rebound in the following cell, so this placeholder looks redundant
climate_features = pd.DataFrame()

In [32]:
# climate_features now refers to the same DataFrame object as climate_data (no copy is made)
climate_features = climate_data

In [33]:
# Summarise the columns, dtypes and non-null counts of climate_features
climate_features.info()

In [34]:
# Write climate_features to climate_data.csv (relative path), leaving out the index column
climate_features.to_csv('climate_data.csv', index=False)

In [35]:
# Step 1: inner-join the tenure and crop features on the household code
tenure_and_crop = tenure_features.merge(crop_features, on='hhcode')
# Step 2: right-join onto the climate features, so every climate_features row is retained
all_data = tenure_and_crop.merge(climate_features, on='hhcode', how='right')
all_data.head()

In [36]:
# Summarise the columns, dtypes and non-null counts of the merged frame
all_data.info()

In [37]:
# Count the missing values in every column of all_data
# (the row display limit is lifted inside the context so no column is elided from the listing)
with pd.option_context('display.max_rows', None):
    print(all_data.isna().sum())

In [38]:
# Print the frequency table of every all_data column (NaN counted as a value) for inspection
columns = all_data.columns
for col in columns:
    frequency_table = all_data[col].value_counts(dropna=False)
    print(frequency_table)

In [39]:
# Show the first 77 entries of `columns` (bound to all_data.columns in the previous cell)
print(columns[:77])

In [40]:
from datetime import datetime

from dateutil import parser


def extract_month(date_string):
    """Return the full month name found in a date string, or ``pd.NA``.

    Parameters
    ----------
    date_string : object
        Text such as '15 March 2004' or 'June'. Non-string values are
        rejected by the parser and reported as missing.

    Returns
    -------
    str or pandas.NA
        The month name produced by ``strftime('%B')``, or ``pd.NA`` when the
        value cannot be parsed or does not itself contain a month.

    Notes
    -----
    dateutil fills any field missing from the text with the corresponding
    field of its ``default`` datetime, which is "today" when no default is
    given. A string with no month (for example '5' or '2004') would therefore
    come back as the month in which the notebook happened to be run. To detect
    that case the string is parsed against two fixed defaults that differ only
    in their month: when the results disagree, the month came from the default
    and not from the text, so ``pd.NA`` is returned.
    """
    try:
        parsed_date = parser.parse(date_string, default=datetime(2000, 1, 1))
        cross_check = parser.parse(date_string, default=datetime(2000, 2, 1))
    except (TypeError, ValueError, OverflowError):
        # TypeError: not a string; ValueError: unparseable or impossible date;
        # OverflowError: a numeric field too large for the platform
        return pd.NA
    if parsed_date.month != cross_check.month:
        return pd.NA
    return parsed_date.strftime('%B')  # Extract full month name

# Apply the function to the date columns and extract the month
date_cols = ['season1s', 'season1e', 'season2s', 'season2e', 's1p1c1plant', 's1p1c1harv', 's2p1c1plant', 's2p1c1harv']
# Each listed column is overwritten with what extract_month returns for its values
for col in date_cols:
    all_data[col] = all_data[col].apply(extract_month)

# Print the DataFrame with extracted months (pd.NA where extract_month could not produce one)
print(all_data)

In [41]:
# Tokens stripped from the column names, applied in this order
patterns_to_remove = ['p1c1', 'p1']

# Column labels as they stand before renaming
columns_to_process = all_data.columns


def _strip_patterns(name):
    """Return `name` with every occurrence of each token removed, one token after another."""
    for pattern in patterns_to_remove:
        name = name.replace(pattern, '')
    return name


# Build the shortened labels and install them on the DataFrame
new_columns = [_strip_patterns(col) for col in columns_to_process]
all_data.columns = new_columns


In [42]:
# Show the first 70 column labels of all_data after the renaming above
all_data.columns[:70]

In [43]:
# Independent copy of all_data; the following cells recode and rename model_data only
model_data = all_data.copy()

In [44]:
# Recodings for the modelling dataset: each dictionary maps an existing value to its replacement
water_replacement_dict = {0: 5}
irrig_replacement_dict = {0: 4}
market_replacement_dict = {0: 3}
seasname_replacement_dict = {'-99': pd.NA, '-999': pd.NA, '.': 3}

# Column groups that share a recoding
water_col = ['s1wat1', 's1wat2', 's1wat3', 's1wat4', 's1wat5', 's2wat1', 's2wat2', 's2wat3', 's2wat4', 's2wat5']
irrig_col = ['s1irrig1', 's1irrig2', 's1irrig3', 's1irrig4', 's2irrig1', 's2irrig2', 's2irrig3', 's2irrig4']
season_col = ['seas1nam', 'seas2nam']
market_col = ['s1mkt', 's2mkt']

# Pair every column group with its dictionary and apply them in the original order
recoding_plan = [
    (water_col, water_replacement_dict),
    (irrig_col, irrig_replacement_dict),
    (season_col, seasname_replacement_dict),
    (market_col, market_replacement_dict),
]
for group_cols, mapping in recoding_plan:
    for col in group_cols:
        model_data[col] = model_data[col].replace(mapping)


In [45]:
# Old column label -> descriptive label used in the modelling dataset.
# NOTE(review): 's1plant_data' / 's2plant_data' look like typos for '..._date'
# (compare 's1harv_date'); left unchanged here because the names end up in the
# exported CSV - confirm before renaming.
column_mapping = {
    # Location and season boundaries
    'adm0': 'Country',
    'adm1': 'Region',
    'season1s': 's1start',
    'season1e': 's1end',
    'season2s': 's2start',
    'season2e': 's2end',
    # Season 1 crop details
    's1': 'crop1',
    's1plant': 's1plant_data',
    's1harv': 's1harv_date',
    's1area': 's1land_area',
    's1qharv': 's1quant_harv',
    's1cons': 's1consumed',
    's1lives': 's1livestock',
    's1lost': 's1lost',
    's1mkt': 's1market',
    's1sold': 's1quant_sold',
    's1cval': 's1crop_val',
    's1seed': 's1no_seed',
    's1sval': 's1seed_cost',
    # Season 2 crop details
    's2': 'crop2',
    's2plant': 's2plant_data',
    's2harv': 's2harv_date',
    's2area': 's2land_area',
    's2qharv': 's2quant_harv',
    's2cons': 's2consumed',
    's2lives': 's2livestock',
    's2lost': 's2lost',
    's2mkt': 's2market',
    's2sold': 's2quant_sold',
    's2cval': 's2crop_val',
    's2seed': 's2no_seed',
    's2sval': 's2seed_cost',
}

# Rebind model_data to the renamed frame (labels missing from the frame are ignored by rename)
model_data = model_data.rename(columns=column_mapping)

In [46]:
# Write the modelling dataset to ./Dataset/model_data.csv, leaving out the index column
model_data.to_csv('./Dataset/model_data.csv', index=False)

In [47]:
# Independent copy of all_data; the following cells replace its numeric codes with text labels
analysis_data = all_data.copy()

In [48]:
# Show the first 70 column labels of analysis_data
analysis_data.columns[:70]

In [49]:
### Dictionaries mapping the numeric survey codes to human-readable labels
crop_replacement_dict = {
    1 : 'alfalfa',
    2 : 'banana',
    3 : 'barley',
    4 : 'beans',
    5 : 'cashew',
    6 : 'cassava',
    7 : 'citrus fruit',
    8 : 'chickpeas',
    9 : 'clover',
    10 : 'cocoa',
    11 : 'cocoyam',
    12 : 'cowpea',
    13 : 'coffee',
    14 : 'cotton',
    15 : 'cucumber',
    16 : 'enset',
    17 : 'field pea',
    18 : 'flax',
    19 : 'garden-eggs',
    20 : 'garlic',
    21 : 'grape',
    22 : 'groundnut',
    23 : 'kola',
    24 : 'lentil',
    25 : 'mango',
    26 : 'maize',
    27 : 'millet',
    28 : 'oil palm',
    29 : 'okra',
    30 : 'onion',
    31 : 'palm dates',
    32 : 'paprika',
    33 : 'peanuts',
    34 : 'pepper',
    35 : 'pigeon pea',
    36 : 'pineapple',
    37 : 'plantain',
    38 : 'potato',
    39 : 'rice',
    40 : 'safflower',
    41 : 'sesame',
    42 : 'shallots',
    43 : 'sheanut',
    44 : 'sorghum',
    45 : 'soybean',
    46 : 'spinach',
    47 : 'squash',
    48 : 'sugarcane',
    49 : 'sunflower',
    50 : 'tea',
    51 : 'tef',
    52 : 'tobacco',
    53 : 'tomato',
    54 : 'wheat',
    55 : 'yam',
    56 : 'other'
}

water_replacement_dict = {
    1 : 'irrigated major scheme',
    2 : 'irrigated minor scheme',
    3 : 'irrigated groundwater',
    4 : 'rain-fed',
    5 : 'other',
    0 : 'other'
}

irrig_replacement_dict = {
    1 : 'gravity',
    2 : 'sprinklers',
    3 : 'drip systems',
    4 : 'other',
    0 : 'other'
}

market_replacement_dict = {
    1 : 'Directly to consumers',
    2 : 'Middleman/wholesale',
    3 : 'Other',
    4 : 'Combination',
    0 : 'Other'
}

seasname_replacement_dict = {
    '1' : 'winter season',
    '2' : 'summer season',
    '3' : 'others',
    '-99' : pd.NA,
    '-999' : pd.NA,
    # Was 'other'; aligned with the 'others' label used for every other
    # catch-all season name so they do not split into two categories
    '.' : 'others'
}

tenure_replacement_dict = {
    1 : 'Own land and use',
    2 : 'Own land and rent',
    3 : 'Sharecropped land',
    4 : 'Communal land',
    5 : 'Rented land',
    6 : 'Borrowed land',
    7 : 'Other',
}

farmsys_replacement_dict = {
    1 : 'Shifting cultivation',
    2 : 'Continuous cropping',
    3 : 'CC with multiple rotations',
    4 : 'Livestock grazing land',
    5 : 'Other',
    6 : 'Combination',
}

farmtype_replacement_dict = {
    1: 'small-scale',
    2: 'medium scale',
    3: 'large-scale',
}

transport_replacement_dict = {
    1 : 'walk',
    2 : 'animal',
    3 : 'cart/bicycle',
    4 : 'motorized vehicle',
    # The original listed key 6 twice, so 'combination' was silently discarded.
    # 6 is the 'Other' code used when transport was cleaned, leaving 5 for 'combination'.
    5 : 'combination',
    6 : 'other',
}


# Replace column values using the replacement dictionaries
crop_col = ['s1', 's2', 'pc1']
water_col = ['s1wat1', 's1wat2', 's1wat3', 's1wat4', 's1wat5', 's2wat1', 's2wat2', 's2wat3', 's2wat4', 's2wat5']
irrig_col = ['s1irrig1', 's1irrig2', 's1irrig3', 's1irrig4', 's2irrig1', 's2irrig2', 's2irrig3', 's2irrig4']
season_col = ['seas1nam', 'seas2nam']
market_col = ['s1mkt', 's2mkt']

# Every group of columns is paired with the dictionary that labels it
label_plan = [
    (crop_col, crop_replacement_dict),
    (water_col, water_replacement_dict),
    (irrig_col, irrig_replacement_dict),
    (season_col, seasname_replacement_dict),
    (market_col, market_replacement_dict),
    (['fsystem1'], farmsys_replacement_dict),
    (['tenure1'], tenure_replacement_dict),
    # The next two previously used tenure_replacement_dict, which labelled farm
    # types and transport modes with land-tenure categories
    (['farmtype'], farmtype_replacement_dict),
    (['transport'], transport_replacement_dict),
]
for group_cols, mapping in label_plan:
    for col in group_cols:
        # Series.replace leaves values that are not keys of `mapping` untouched
        analysis_data[col] = analysis_data[col].replace(mapping)


In [50]:
# Print the frequency table of every analysis_data column (NaN counted as a value) for inspection
columns = analysis_data.columns
for col in columns:
    frequency_table = analysis_data[col].value_counts(dropna=False)
    print(frequency_table)

In [51]:
# Write the labelled analysis dataset to ./Dataset/analysis_data.csv, leaving out the index column
analysis_data.to_csv('./Dataset/analysis_data.csv', index=False)