In [1]:
import os
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import xgboost as xgb

In [2]:
os.getcwd()

'c:\\Users\\Rafimbi\\Documents\\RYDAMTAANI'

In [3]:
os.chdir("C:\\Users\\Rafimbi\\Documents\\Python Projects")

In [4]:
table = pd.read_excel("Cashcropsdata.xlsx")

In [7]:
#View first few rows
table.head()

Unnamed: 0,gender,value_chain,county,applied_for_agric_loan,hhsize,hh_income,total_land,fcs
0,Male,Crop production,Embu,No,7,8000,1.0,65.5
1,Male,Milk and Livestock production,Kiambu,No,2,60000,2.0,81.0
2,Male,Crop production,Busia,No,7,10000,1.0,50.5
3,Female,Milk and Livestock production,Kiambu,Yes,2,20000,2.0,83.5
4,Male,Crop production,Embu,No,2,10000,0.25,36.0


In [None]:
#XGBoost only works with numbers, so we convert categories into numeric codes
for col in ['gender', 'value_chain', 'county', 'applied_for_agric_loan']:
    table[col] = table[col].astype('category').cat.codes #astype('category')Converts the column into a categorical data type (instead of object or string).
#.cat.codes → Replaces each category with a numeric code:
#Assign back to Table[col] → Updates the original column so it’s now numeric, ready for models like XGBoost.

In [15]:
table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435 entries, 0 to 434
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   gender                  435 non-null    int8   
 1   value_chain             435 non-null    int8   
 2   county                  435 non-null    int8   
 3   applied_for_agric_loan  435 non-null    int8   
 4   hhsize                  435 non-null    int64  
 5   hh_income               435 non-null    int64  
 6   total_land              435 non-null    float64
 7   fcs                     435 non-null    float64
dtypes: float64(2), int64(2), int8(4)
memory usage: 15.4 KB


In [42]:
#Split train_data = Table.sample(frac=0.8, random_state=42)
train_data = table.sample(frac=0.8, random_state=42)
test_data = table.drop(train_data.index)
X_train = train_data.drop(columns=['fcs'])
y_train = train_data['fcs']



In [64]:
#Remaining 20% as testing data
test_data = table.drop(train_data.index)
X_test = test_data.drop(columns=['fcs'])
y_test = test_data['fcs']


In [43]:
#check how many rows are in the training set
print("xgb:", train_data)

xgb:      gender  value_chain  county  applied_for_agric_loan  hhsize  hh_income  \
280       1            0       1                       1       6       4000   
78        0            0       1                       0       3      30000   
113       1            0      10                       0       6       5000   
253       1            0       1                       1       3      14000   
324       1            0       1                       0       5       5000   
..      ...          ...     ...                     ...     ...        ...   
216       1            1      10                       0       8      10000   
279       1            1      10                       0       4      50000   
377       1            0       1                       0       4      10000   
337       1            0       8                       0       3      28000   
236       1            1      10                       1       8       4500   

     total_land   fcs  
280        0.75  33.5 

In [44]:
#check how many rows are in the test set
print("xgb:", test_data)

xgb:      gender  value_chain  county  applied_for_agric_loan  hhsize  hh_income  \
1         1            2       3                       0       2      60000   
13        1            2       4                       0       5      15000   
20        1            0       0                       0       7       3000   
21        0            0       6                       0      10      30000   
34        1            0       6                       0       3       2000   
..      ...          ...     ...                     ...     ...        ...   
420       1            0       5                       0       2      25000   
425       1            0       8                       1       5       5000   
428       1            0      10                       0       6      50000   
432       1            1      10                       0      12      25000   
434       1            2       3                       0       4      25000   

     total_land   fcs  
1          2.00  81.0 

In [45]:
#make predictions on the test
model = xgb.XGBRegressor(
    objective='reg:squarederror',  # Regression task
    n_estimators=100,              # Number of boosting rounds
    learning_rate=0.1,             # Step size
    max_depth=5,                   # Tree depth
    random_state=42
)

model.fit(X_train, y_train)


In [None]:
#Separate features & target for training set
X_train = train_data[['gender', 'value_chain', 'county', 'applied_for_agric_loan',
                      'hhsize', 'hh_income', 'total_land']]
y_train = train_data['fcs']

In [54]:
#Separate features & target for testing set
X_train = test_data[['gender', 'value_chain', 'county', 'applied_for_agric_loan',
                      'hhsize', 'hh_income', 'total_land']]
y_train = test_data['fcs']

In [55]:
#Make predictions
predictions = model.predict(X_test)

In [56]:
#evaluate model performance - calculate manually mse
mse = sum((y_test - predictions)**2) / len(y_test)
print(f"Mean Squared Error (MSE): {mse:.2f}")


Mean Squared Error (MSE): nan


In [None]:
#low MSE → Model is predicting fcs well.
#high MSE → Predictions are off
#Encoding categorical variables → Converts gender, value_chain, county, and applied_for_agric_loan to numeric codes so XGBoost can process them.
#Splitting data → .sample(frac=0.8) randomly selects 80% of rows for training. The rest are used for testing.
#Selecting features → Only the given independent variables are kept for X_train.
#Fitting the model → .fit(X_train, y_train) trains the XGBoost model using the training data only.