In [14]:
import pandas as pd

# Load the train and test datasets
train_df = pd.read_csv('kiva_train.csv')
test_df = pd.read_csv('kiva_test.csv')

# Display the first few rows of the train dataset to understand its structure
train_df.head(), train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5454 entries, 0 to 5453
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           5454 non-null   int64 
 1   country      5454 non-null   object
 2   en           5454 non-null   object
 3   gender       5454 non-null   object
 4   loan_amount  5454 non-null   int64 
 5   nonpayment   5454 non-null   object
 6   sector       5454 non-null   object
 7   status       5454 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 341.0+ KB


(   id             country                                                 en  \
 0   1             Ecuador  <h4>Business Description</h4> \n <p> Don Mauro...   
 1   2  Dominican Republic  Rosa Iris is a brilliant entrepreneur who sell...   
 2   3               Kenya  Sirote is married with six children. Two of he...   
 3   4               Kenya  David Mwangi Kimani  is 33 years old and marri...   
 4   5  Dominican Republic  Nilda is a very persistent woman who has learn...   
 
   gender  loan_amount nonpayment       sector  status  
 0      M          825     lender         Food       1  
 1      F          450    partner       Retail       0  
 2      F          600     lender  Agriculture       1  
 3      M          650     lender         Food       1  
 4      F          325    partner         Food       0  ,
 None)

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report
import numpy as np
import nltk
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/Ollie/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [19]:
# Function to apply sentiment analysis on the 'en' column
def sentiment_analysis(text_series):
    analyzer = SentimentIntensityAnalyzer()
    return text_series.apply(lambda x: analyzer.polarity_scores(x)['compound']).values.reshape(-1, 1)

# Split the features and target
X = train_df.drop(columns=['id', 'status'])
y = train_df['status']

# Identify categorical and numerical columns
categorical_cols = ['country', 'gender', 'nonpayment', 'sector']
numerical_cols = ['loan_amount']

# Define the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('sentiment', FunctionTransformer(sentiment_analysis), 'en'),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
            ('num', 'passthrough', numerical_cols)
        ])),
    ('scaler', StandardScaler(with_mean=False)),  # Set with_mean=False to avoid the error
    ('model', LogisticRegression(random_state=42, max_iter=1000))
])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = pipeline.predict(X_val)
y_val_proba = pipeline.predict_proba(X_val)[:, 1]

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("Precision:", precision_score(y_val, y_val_pred))
print("Recall:", recall_score(y_val, y_val_pred))
print("ROC-AUC Score:", roc_auc_score(y_val, y_val_proba))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

# Prepare the test dataset and make predictions
X_test = test_df.drop(columns=['id'])
test_predictions = pipeline.predict_proba(X_test)[:, 1]

# Create the output DataFrame with id and predicted status
output_df = pd.DataFrame({
    'id': test_df['id'],
    'status': test_predictions
})

# Save the output to a CSV file
output_file = 'kiva_predictions.csv'
output_df.to_csv(output_file, index=False)
output_file

Accuracy: 0.8762603116406966
Precision: 0.8911290322580645
Recall: 0.8451242829827916
ROC-AUC Score: 0.9141632779468396

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.90      0.88       568
           1       0.89      0.85      0.87       523

    accuracy                           0.88      1091
   macro avg       0.88      0.88      0.88      1091
weighted avg       0.88      0.88      0.88      1091



'kiva_predictions.csv'