# Objective: Analyze and summarize reasons for investment choices.

In [1]:
import pandas as pd

# Read the survey CSV into a DataFrame.
file_path = 'Data_set.csv'  # Replace with your file path
df = pd.read_csv(file_path)

# Show the first rows of the loaded table.
print(df.head())

# Pull out the four "reason" columns, one Series per column.
reasons_equity, reasons_mutual, reasons_bonds, reasons_fd = (
    df[column_name]
    for column_name in ('Reason_Equity', 'Reason_Mutual', 'Reason_Bonds', 'Reason_FD')
)

# Print a short preview of every reason column under its own heading.
for heading, reason_series in (
    ("Equity Reasons:\n", reasons_equity),
    ("Mutual Funds Reasons:\n", reasons_mutual),
    ("Bonds Reasons:\n", reasons_bonds),
    ("Fixed Deposits Reasons:\n", reasons_fd),
):
    print(heading, reason_series.head())


   gender  age Investment_Avenues  Mutual_Funds  Equity_Market  Debentures  \
0  Female   34                Yes             1              2           5   
1  Female   23                Yes             4              3           2   
2    Male   30                Yes             3              6           4   
3    Male   22                Yes             2              1           3   
4  Female   24                 No             2              1           3   

   Government_Bonds  Fixed_Deposits  PPF  Gold  ...           Duration  \
0                 3               7    6     4  ...          1-3 years   
1                 1               5    6     7  ...  More than 5 years   
2                 2               5    1     7  ...          3-5 years   
3                 7               6    4     5  ...   Less than 1 year   
4                 6               4    5     7  ...   Less than 1 year   

  Invest_Monitor   Expect       Avenue What are your savings objectives?  \
0        M

In [2]:
# Clean and preprocess the data
from collections import Counter
import re

# Define a preprocessing function
def preprocess(text):
    """Normalize a text value so it can be tokenized on whitespace.

    Steps, in order:
      1. lowercase the text;
      2. replace every punctuation character (anything that is not a word
         character or whitespace, plus the underscore) with a single space;
      3. delete words that are only one or two characters long
         (e.g. "a", "an", "of").

    Punctuation is handled before the short-word filter and is replaced by a
    space instead of being deleted, so words joined by punctuation remain
    separate tokens ("risk-free" -> "risk free") rather than being fused into
    one ("riskfree").

    Parameters
    ----------
    text : str
        Raw text. Must be a string; a non-string value (such as NaN) fails on
        ``.lower()``, so missing values should be dropped beforehand.

    Returns
    -------
    str
        The cleaned text. Extra spaces may remain where characters or short
        words were removed; splitting on whitespace yields the tokens.
    """
    text = text.lower()  # Convert to lowercase
    # Turn punctuation (and underscores, which \w would otherwise keep) into
    # spaces so adjacent words are separated rather than glued together.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    # Drop 1-2 character words; runs after punctuation handling so fragments
    # split off by punctuation are filtered too.
    text = re.sub(r'\b\w{1,2}\b', '', text)
    return text

# Clean every reason column: discard missing entries, then run preprocess on the rest.
cleaned_equity, cleaned_mutual, cleaned_bonds, cleaned_fd = (
    raw_reasons.dropna().apply(preprocess)
    for raw_reasons in (reasons_equity, reasons_mutual, reasons_bonds, reasons_fd)
)


In [4]:
# Flatten each cleaned column into a single list of whitespace-separated words.
all_words_equity = [word for reason in cleaned_equity for word in reason.split()]
all_words_mutual = [word for reason in cleaned_mutual for word in reason.split()]
all_words_bonds = [word for reason in cleaned_bonds for word in reason.split()]
all_words_fd = [word for reason in cleaned_fd for word in reason.split()]

# Tally how often each word occurs, one Counter per column.
word_freq_equity, word_freq_mutual, word_freq_bonds, word_freq_fd = map(
    Counter,
    (all_words_equity, all_words_mutual, all_words_bonds, all_words_fd),
)


In [5]:

# Take the ten highest-count words from each frequency table.
common_words_equity, common_words_mutual, common_words_bonds, common_words_fd = (
    frequencies.most_common(10)
    for frequencies in (word_freq_equity, word_freq_mutual, word_freq_bonds, word_freq_fd)
)

# Print each top-ten list beneath its label.
for label, top_words in (
    ("Common Equity Reasons:\n", common_words_equity),
    ("Common Mutual Funds Reasons:\n", common_words_mutual),
    ("Common Bonds Reasons:\n", common_words_bonds),
    ("Common Fixed Deposits Reasons:\n", common_words_fd),
):
    print(label, top_words)


Common Equity Reasons:
 [('capital', 30), ('appreciation', 30), ('dividend', 8), ('liquidity', 2)]
Common Mutual Funds Reasons:
 [('better', 24), ('returns', 24), ('fund', 13), ('diversification', 13), ('tax', 3), ('benefits', 3)]
Common Bonds Reasons:
 [('assured', 26), ('returns', 26), ('safe', 13), ('investment', 13), ('tax', 1), ('incentives', 1)]
Common Fixed Deposits Reasons:
 [('risk', 19), ('free', 19), ('fixed', 18), ('returns', 18), ('high', 3), ('interest', 3), ('rates', 3)]
