# VI. Chatbot Implementation
## Chapters 
- I. How the chatbot works
- II. Conversion of Correlation Matrix into knowledge.txt

## I. How it works

Our chatbot implementation uses OpenAI's gpt-3.5-turbo model as its engine to analyze the correlation matrix. Before we can do that, we have to convert our correlation_matrix.csv into knowledge.txt, which will be used to train our Correlation Analysis.

## II. Conversion

I. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

In [3]:
# Load the correlation matrix CSV into a DataFrame.
correlation = pd.read_csv("./VI.Chatbot/correlation_matrix.csv")
# Bare expression: as the last statement of a notebook cell it displays the frame.
correlation

Unnamed: 0.1,Unnamed: 0,age_pyramid_total_nyc,median_household_income_nyc,median_personal_earnings_nyc,no_hs_nyc,at_least_hs_nyc,complete_hs_somecollege_nyc,complete_bach_nyc,at_least_bachelors_nyc,grad_degree_nyc,...,veterans_med_income_nyc,veterans_poverty_nyc,veterans_poverty_percent_nyc,veterans_poverty_disabled_nyc,veterans_poverty_disabled_percent_nyc,vet_total_nyc_18_64,veterans_unemployed_nyc,veterans_unemployed_percent_nyc,veterans_disabled_nyc,veterans_disabled_percent_nyc
0,age_pyramid_total_nyc,1.000000,-0.746539,-0.721950,0.662630,-0.662630,0.471650,-0.695634,-0.632688,-0.542627,...,-0.303707,-0.370139,-0.747646,-0.485747,-0.714641,0.631425,0.850547,0.628440,0.390758,-0.030474
1,median_household_income_nyc,-0.746539,1.000000,0.972589,-0.961426,0.961426,-0.588893,0.805786,0.881985,0.905028,...,0.719134,-0.106758,0.509542,-0.080271,0.266695,-0.871934,-0.302724,0.003386,-0.716601,-0.126211
2,median_personal_earnings_nyc,-0.721950,0.972589,1.000000,-0.988286,0.988286,-0.729969,0.905428,0.953633,0.947565,...,0.597950,-0.044521,0.562569,-0.137251,0.204925,-0.955594,-0.286138,0.053649,-0.718335,-0.125271
3,no_hs_nyc,0.662630,-0.961426,-0.988286,1.000000,-1.000000,0.746818,-0.896998,-0.968028,-0.981896,...,-0.634712,0.117927,-0.521895,0.178736,-0.175675,0.953129,0.192194,-0.150513,0.648040,0.016984
4,at_least_hs_nyc,-0.662630,0.961426,0.988286,-1.000000,1.000000,-0.746818,0.896998,0.968028,0.981896,...,0.634712,-0.117927,0.521895,-0.178736,0.175675,-0.953129,-0.192194,0.150513,-0.648040,-0.016984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,vet_total_nyc_18_64,0.631425,-0.871934,-0.955594,0.953129,-0.953129,0.893675,-0.978392,-0.991252,-0.951186,...,-0.495229,-0.101522,-0.658606,0.119419,-0.195615,1.000000,0.210839,-0.154806,0.594734,0.016754
193,veterans_unemployed_nyc,0.850547,-0.302724,-0.286138,0.192194,-0.192194,0.152005,-0.338704,-0.189244,-0.043910,...,0.155224,-0.567241,-0.610661,-0.685949,-0.736418,0.210839,1.000000,0.932222,0.120299,0.019524
194,veterans_unemployed_percent_nyc,0.628440,0.003386,0.053649,-0.150513,0.150513,-0.192022,0.023734,0.175731,0.304323,...,0.335171,-0.511948,-0.354601,-0.713005,-0.650296,-0.154806,0.932222,1.000000,-0.064083,0.050457
195,veterans_disabled_nyc,0.390758,-0.716601,-0.718335,0.648040,-0.648040,0.226652,-0.470740,-0.530264,-0.556545,...,-0.375757,0.288019,-0.067438,0.480629,0.271819,0.594734,0.120299,-0.064083,1.000000,0.768516


Correlation Legend : 
 - Close to 1 (strong positive correlation) 
 - Close to -1 (strong negative correlation)
 - Close to 0 (little or no correlation)

# Code to convert the CSV into knowledge.txt


In [6]:
def extract_significant_correlations(csv_file_path, threshold=0.8):
    """Collect the strong, non-perfect correlations stored in a CSV matrix.

    The CSV is expected to contain one column of row labels followed by one
    numeric column per variable.  The label column is the first non-numeric
    column, falling back to the first column when every column is numeric.
    Columns to the left of the label column (for example an integer index
    that was written out by an earlier save) and non-numeric columns to its
    right are ignored instead of being fed to ``abs()``.

    Args:
        csv_file_path: Path of the correlation-matrix CSV to read.
        threshold: Minimum absolute correlation for a cell to be kept.

    Returns:
        A dict mapping ``(row_label, column_name)`` to the correlation value
        for every cell with ``threshold <= abs(value) < 1`` whose row label
        differs from its column name.  Mirrored cells are not merged: if
        the file contains both ``(a, b)`` and ``(b, a)``, both are returned.
    """
    correlation_matrix = pd.read_csv(csv_file_path)
    if correlation_matrix.shape[1] == 0:
        return {}

    # Decide per column whether it can hold correlation values at all.
    is_numeric = [
        pd.api.types.is_numeric_dtype(dtype)
        for dtype in correlation_matrix.dtypes
    ]

    # The row labels live in the first non-numeric column; with an
    # all-numeric file keep treating column 0 as the labels.
    label_position = next(
        (pos for pos, numeric in enumerate(is_numeric) if not numeric), 0
    )
    # Read the labels once instead of doing an iloc lookup for every cell.
    row_labels = list(correlation_matrix.iloc[:, label_position])

    significant_correlations = {}
    for position in range(label_position + 1, correlation_matrix.shape[1]):
        if not is_numeric[position]:
            continue  # abs() is undefined for text columns
        column = correlation_matrix.columns[position]
        values = correlation_matrix.iloc[:, position]
        for row_label, value in zip(row_labels, values):
            # Upper bound drops perfect (+/-1) correlations; NaN fails both
            # comparisons and is therefore skipped as well.
            if threshold <= abs(value) < 1 and row_label != column:
                # row_label != column skips a variable paired with itself.
                significant_correlations[(row_label, column)] = value

    return significant_correlations

def format_correlations_to_text(correlations):
    """Render a correlation mapping as a plain-text report.

    Args:
        correlations: Mapping of ``(var1, var2)`` pairs to their
            correlation value.

    Returns:
        A string starting with a "Significant Correlations:" header line,
        followed by one newline-terminated line per pair giving both names,
        the sign of the correlation and its value to two decimals.
    """
    pieces = ["Significant Correlations:\n"]
    for pair, coefficient in correlations.items():
        first, second = pair
        # Anything that is not strictly positive is reported as negative.
        if coefficient > 0:
            direction = "Positive"
        else:
            direction = "Negative"
        pieces.append(
            f"Between '{first}' and '{second}': {direction} Correlation ({coefficient:.2f})\n"
        )
    return "".join(pieces)


def create_knowledge_text(csv_file_path, output_file_path, threshold=0.8):
    """Convert a correlation-matrix CSV into a plain-text knowledge file.

    Args:
        csv_file_path: Path of the correlation-matrix CSV to read.
        output_file_path: Path of the text file to create or overwrite.
        threshold: Minimum absolute correlation passed on to
            ``extract_significant_correlations``.
    """
    correlations = extract_significant_correlations(csv_file_path, threshold)
    text = format_correlations_to_text(correlations)
    # Pin the encoding so the file does not depend on the platform locale
    # and variable names with non-ASCII characters can always be written.
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(text)

# Example usage: build knowledge.txt from the correlation matrix CSV,
# relying on create_knowledge_text's default threshold.
csv_file_path = './VI.Chatbot/correlation_matrix.csv'  # Replace with your CSV file path
output_file_path = 'knowledge.txt'  # Relative path for the generated text file
create_knowledge_text(csv_file_path, output_file_path)
