#Preprocessing the data

The first stage is data preprocessing, in which we remove stop words, tokenize the text, apply stemming, and handle missing values.

In [None]:
import csv
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re

# Download the NLTK data packages, in the same order as before.
# NOTE(review): nothing in this cell visibly uses 'wordnet' — confirm it is still needed.
for resource_name in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource_name)

# Resources shared by every preprocess_text call. They are built lazily on
# first use so the stop-word list is loaded (and the stemmer constructed)
# once instead of once per statement.
_STOP_WORDS = None
_STEMMER = None


# Function to preprocess text data
def preprocess_text(text):
    """Strip punctuation, drop English stop words and stem what remains.

    Args:
        text: Raw statement text. ``None`` or blank text is treated as a
            missing value.

    Returns:
        The stemmed tokens joined by single spaces, or ``''`` when the input
        is missing.
    """
    global _STOP_WORDS, _STEMMER

    # Missing value: there is nothing to tokenize
    if text is None or not text.strip():
        return ''

    if _STOP_WORDS is None:
        _STOP_WORDS = set(stopwords.words('english'))
    if _STEMMER is None:
        _STEMMER = PorterStemmer()

    # Remove punctuation (every character that is neither word nor whitespace)
    text = re.sub(r"[^\w\s]", "", text)

    # Tokenize, then filter stop words case-insensitively
    words = [word for word in word_tokenize(text) if word.lower() not in _STOP_WORDS]

    # Stem the surviving tokens and join them back into one string
    return ' '.join(_STEMMER.stem(word) for word in words)

# Function to read data from a CSV file
def read_data_from_csv(input_filename):
    """Load a CSV file into a list of dicts keyed by the header row.

    Args:
        input_filename: Path of the CSV file; its first row is the header.

    Returns:
        One ``{column_name: value}`` dict per data row, in file order. Values
        beyond the header width are ignored, and short rows simply lack the
        trailing keys. An empty file yields ``[]``.
    """
    # newline='' leaves line-ending handling to the csv module, so newlines
    # embedded in quoted fields are read back unchanged
    with open(input_filename, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader, None)
        if header is None:
            return []
        return [dict(zip(header, row)) for row in reader]

# Function to write data to a CSV file
def write_data_to_csv(output_filename, data):
    """Write leader/text pairs to a two-column CSV file.

    Args:
        output_filename: Path of the CSV file to create or overwrite.
        data: Iterable of mappings holding 'Leader' and 'preprocessed_text'.
    """
    columns = ['Leader', 'preprocessed_text']
    with open(output_filename, 'w', newline='') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(columns)
        # Rows are produced lazily, one per entry, in input order
        csv_out.writerows([entry[name] for name in columns] for entry in data)

# Read data from the input CSV file
input_data = read_data_from_csv('covid_statements.csv')

# Merge the statements of each leader (matched case-insensitively) into one
# entry, keeping leaders in first-seen order. A dict lookup finds the existing
# entry directly instead of scanning output_data for every row.
output_data = []
entries_by_leader = {}

for row in input_data:
    # The first header name presumably carries a UTF-8 byte-order mark from
    # the source file; accept the column with or without that prefix.
    leader = row['\ufeffLeader'] if '\ufeffLeader' in row else row['Leader']
    preprocessed_text = preprocess_text(row['statement'])

    leader_key = leader.lower()
    entry = entries_by_leader.get(leader_key)
    if entry is None:
        # First statement for this leader: start a new entry
        entry = {'Leader': leader, 'preprocessed_text': preprocessed_text}
        entries_by_leader[leader_key] = entry
        output_data.append(entry)
    else:
        # Later statements are appended on their own line
        entry['preprocessed_text'] += '\n' + preprocessed_text

# Write the preprocessed data to a new CSV file
write_data_to_csv('covid_statements_preprocessed.csv', output_data)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


#Feature selection Algorithms

Information gain

In [None]:
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile, mutual_info_classif

# Function to process a single data row and extract features using mutual information
def process_single_data(input_row):
    """Select bag-of-words terms for one row using mutual-information scores.

    Args:
        input_row: Mapping with a 'preprocessed_text' entry.

    Returns:
        List of the term names kept by ``SelectPercentile(percentile=50)``;
        ``[]`` when the text is empty or blank.
    """
    preprocessed_text = input_row['preprocessed_text']

    # Blank text has no tokens to vectorize, so report "no features"
    if not preprocessed_text or not preprocessed_text.strip():
        return []

    # Bag-of-words representation of this single document
    vectorizer = CountVectorizer()
    features = vectorizer.fit_transform([preprocessed_text])

    # NOTE(review): the selector is fit on one sample whose label is its own
    # text, so the scores presumably cannot separate terms — confirm whether
    # selection should be fit across all rows with real labels instead.
    selector = SelectPercentile(mutual_info_classif, percentile=50)
    selector.fit(features, [preprocessed_text])

    # Map the selected column indices back to term names
    vocabulary = vectorizer.get_feature_names_out()
    return [vocabulary[idx] for idx in selector.get_support(indices=True)]

# Function to read data from a CSV file
def read_data_from_csv(input_filename):
    """Read a CSV file and return its data rows as header-keyed dicts.

    Args:
        input_filename: Path of the CSV file; the first row names the columns.

    Returns:
        A list with one ``{column_name: value}`` dict per data row. Extra
        values past the header width are dropped, short rows omit the missing
        keys, and an empty file gives ``[]``.
    """
    # Per the csv module's guidance the file is opened with newline='' so
    # quoted fields containing line breaks survive the round trip
    with open(input_filename, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader, None)
        if header is None:
            return []
        return [dict(zip(header, row)) for row in reader]

# Read data from the preprocessed CSV file
input_data = read_data_from_csv('covid_statements_preprocessed.csv')

# Select features once per row and write them to a new CSV file.
# Each row is processed a single time; no separate pass discards results.
with open('information_gain.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Leader', 'information_gain_features'])
    for row in input_data:
        selected_feature_names = process_single_data(row)
        writer.writerow([row['Leader'], ' '.join(selected_feature_names)])


Chi-Squared Test

In [None]:
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# Function to process a single data row and extract features using chi-squared feature selection
def process_single_data(input_row):
    """Select up to four bag-of-words terms for one row using chi-squared scores.

    Args:
        input_row: Mapping with a 'preprocessed_text' entry.

    Returns:
        List of the term names kept by ``SelectKBest(chi2)``; at most four,
        fewer when the vocabulary is smaller, and ``[]`` for blank text.
    """
    preprocessed_text = input_row['preprocessed_text']

    # Blank text has no tokens to vectorize, so report "no features"
    if not preprocessed_text or not preprocessed_text.strip():
        return []

    # Bag-of-words representation of this single document
    vectorizer = CountVectorizer()
    features = vectorizer.fit_transform([preprocessed_text])

    # Never request more features than the vocabulary actually holds
    k = min(4, features.shape[1])

    # NOTE(review): the selector is fit on one sample whose label is its own
    # text, so the scores presumably cannot separate terms — confirm whether
    # selection should be fit across all rows with real labels instead.
    selector = SelectKBest(chi2, k=k)
    selector.fit(features, [preprocessed_text])

    # Map the selected column indices back to term names
    vocabulary = vectorizer.get_feature_names_out()
    return [vocabulary[idx] for idx in selector.get_support(indices=True)]

# Function to read data from a CSV file
def read_data_from_csv(input_filename):
    """Return the rows of a CSV file as dicts keyed by its header row.

    Args:
        input_filename: Path of the CSV file; row one is the header.

    Returns:
        List of ``{column_name: value}`` dicts in file order. Values with no
        matching header column are ignored, rows shorter than the header omit
        the trailing keys, and an empty file returns ``[]``.
    """
    # newline='' is what the csv module expects; without it, line breaks
    # inside quoted fields can be rewritten while reading
    with open(input_filename, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader, None)
        if header is None:
            return []
        return [dict(zip(header, row)) for row in reader]

# Read data from the preprocessed CSV file
input_data = read_data_from_csv('covid_statements_preprocessed.csv')

# Select features once per row and write them to a new CSV file.
# Each row is processed a single time; no separate pass discards results.
with open('chi_squared.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Leader', 'chi_squared_features'])
    for row in input_data:
        selected_feature_names = process_single_data(row)
        writer.writerow([row['Leader'], ' '.join(selected_feature_names)])


cfs

In [None]:
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile, SelectKBest
from sklearn.feature_selection import RFE
from sklearn.feature_selection import GenericUnivariateSelect

# Function to process a single data row and extract features using mutual information feature selection
def process_single_data(input_row):
    """Return the (at most 30) most frequent terms of one row's text.

    Args:
        input_row: Mapping with a 'preprocessed_text' entry.

    Returns:
        List of term names; ``[]`` when the text is empty or blank.
    """
    preprocessed_text = input_row['preprocessed_text']

    # Blank text has no tokens to vectorize, so report "no features"
    if not preprocessed_text or not preprocessed_text.strip():
        return []

    # Bag-of-words limited to the 30 most frequent terms
    vectorizer = CountVectorizer(max_features=30)
    features = vectorizer.fit_transform([preprocessed_text])

    # NOTE(review): this cell is headed "cfs", but the code runs
    # mutual_info_classif with k='all', which keeps every column — so the
    # selector presumably does not filter anything. It is also fit on a single
    # sample labelled with its own text. Confirm the intended algorithm.
    selector = SelectKBest(mutual_info_classif, k='all')
    selector.fit(features, [preprocessed_text])

    # Map the selected column indices back to term names
    vocabulary = vectorizer.get_feature_names_out()
    return [vocabulary[idx] for idx in selector.get_support(indices=True)]

# Function to read data from a CSV file
def read_data_from_csv(input_filename):
    """Parse a CSV file into one dict per data row, keyed by the header.

    Args:
        input_filename: Path of the CSV file; the header is its first row.

    Returns:
        List of ``{column_name: value}`` dicts in file order. Surplus values
        beyond the header width are ignored, short rows lack the trailing
        keys, and an empty file produces ``[]``.
    """
    # Open with newline='' so the csv module, not the text layer, decides
    # where records end (keeps multi-line quoted fields intact)
    with open(input_filename, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader, None)
        if header is None:
            return []
        return [dict(zip(header, row)) for row in reader]

# Read data from the preprocessed CSV file
input_data = read_data_from_csv('covid_statements_preprocessed.csv')

# Select features once per row and write them to a new CSV file.
# Each row is processed a single time; no separate pass discards results.
with open('cfs.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Leader', 'cfs_features'])
    for row in input_data:
        selected_feature_names = process_single_data(row)
        writer.writerow([row['Leader'], ' '.join(selected_feature_names)])


PCC

In [None]:
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, f_classif

# Function to process a single data row and extract features using PCC (Pearson Correlation Coefficient) feature selection
def process_single_data(input_row):
    """Select up to four bag-of-words terms for one row using ``f_classif`` scores.

    Args:
        input_row: Mapping with a 'preprocessed_text' entry.

    Returns:
        List of the term names kept by ``SelectKBest(f_classif)``; at most
        four, fewer when the vocabulary is smaller, and ``[]`` for blank text.
    """
    preprocessed_text = input_row['preprocessed_text']

    # Blank text has no tokens to vectorize, so report "no features"
    if not preprocessed_text or not preprocessed_text.strip():
        return []

    # Bag-of-words representation of this single document
    vectorizer = CountVectorizer()
    features = vectorizer.fit_transform([preprocessed_text])

    # Never request more features than the vocabulary actually holds
    k = min(4, features.shape[1])

    # NOTE(review): the cell calls this "PCC", but the scorer is f_classif
    # (an ANOVA F-test). It is fit on one sample labelled with its own text,
    # which is presumably what triggers the division warnings recorded below
    # the cell — confirm the intended scorer and training data.
    selector = SelectKBest(f_classif, k=k)
    selector.fit(features, [preprocessed_text])

    # Map the selected column indices back to term names
    vocabulary = vectorizer.get_feature_names_out()
    return [vocabulary[idx] for idx in selector.get_support(indices=True)]

# Function to read data from a CSV file
def read_data_from_csv(input_filename):
    """Load the data rows of a CSV file as dicts keyed by the header row.

    Args:
        input_filename: Path of the CSV file; its first row is the header.

    Returns:
        List of ``{column_name: value}`` dicts in file order. Values past the
        header width are ignored, short rows omit the trailing keys, and an
        empty file returns ``[]``.
    """
    # newline='' hands line-ending handling to the csv module so line breaks
    # inside quoted fields are preserved
    with open(input_filename, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader, None)
        if header is None:
            return []
        return [dict(zip(header, row)) for row in reader]

# Load the preprocessed statements, one dict per leader
input_data = read_data_from_csv('covid_statements_preprocessed.csv')

# Emit a header followed by one "leader, space-separated features" line per row
with open('pcc_features.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Leader', 'pcc_features'])
    for row in input_data:
        selected_feature_names = process_single_data(row)
        joined_features = ' '.join(selected_feature_names)
        writer.writerow([row['Leader'], joined_features])


  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
  msw = sswn / float(dfwn)
  msb = ssbn / float(dfbn)
 

#OCEAN SCORES


In [None]:
import csv
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the VADER lexicon, then build the analyzer shared by compute_ocean_scores
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()


def compute_ocean_scores(text):
    """Derive five trait scores from the VADER 'pos' and 'neg' values of a text.

    Args:
        text: Text passed to ``sid.polarity_scores``.

    Returns:
        Dict with keys 'openness', 'conscientiousness', 'extraversion',
        'agreeableness' and 'neuroticism', each rounded to one decimal.
    """
    polarity = sid.polarity_scores(text)
    pos = polarity['pos']
    neg = polarity['neg']

    def to_scale(raw):
        # Halve the raw combination, multiply by 10, keep one decimal
        return round((raw / 2) * 10, 1)

    # Openness and agreeableness use the same combination of pos and neg
    return {
        'openness': to_scale(pos + 1 - neg),
        'conscientiousness': to_scale(pos + neg),
        'extraversion': to_scale(pos + neg + 1),
        'agreeableness': to_scale(pos + 1 - neg),
        'neuroticism': to_scale(neg + 1 - pos),
    }

# Function to read data from a CSV file
def read_data_from_csv(mypersonality_final):
    """Return the second-column value of every data row in a CSV file.

    Args:
        mypersonality_final: Path of the CSV file; its first row is skipped
            as the header.

    Returns:
        List of second-column strings in file order. Blank lines are skipped
        and an empty file yields ``[]``.

    Raises:
        IndexError: If a non-blank row has fewer than two columns.
    """
    # newline='' lets the csv module handle line endings inside quoted fields
    with open(mypersonality_final, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header row (tolerates an empty file)
        # The csv reader yields [] for blank lines; those carry no text
        return [row[1] for row in reader if row]

# Function to write OCEAN scores to a new CSV file
def write_output_to_csv(mypersonality_final, output_file):
    """Copy a CSV file, appending five OCEAN score columns to every row.

    Args:
        mypersonality_final: Path of the input CSV; the text to score is
            taken from the second column of each row.
        output_file: Path of the CSV file to create or overwrite.

    Blank input lines are skipped; an empty input file produces an empty
    output file.
    """
    trait_names = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']

    # newline='' on both files lets the csv module handle line endings itself
    with open(mypersonality_final, 'r', newline='') as input_csvfile, \
            open(output_file, 'w', newline='') as output_csvfile:
        reader = csv.reader(input_csvfile)
        writer = csv.writer(output_csvfile)

        header = next(reader, None)
        if header is None:
            return  # no header means there is nothing to copy or score

        # Write the header row with OCEAN score columns
        writer.writerow(header + trait_names)

        for row in reader:
            if not row:
                continue  # blank line: no text column to score
            ocean_scores = compute_ocean_scores(row[1])
            writer.writerow(row + [ocean_scores[name] for name in trait_names])

# Read data from the CSV file
dataset = read_data_from_csv('information_gain.csv')

# Print the number of rows in the dataset
print(len(dataset))

# Write OCEAN scores to a new CSV file. write_output_to_csv scores every row
# itself, so no separate scoring pass is run here.
write_output_to_csv('information_gain.csv', 'IG_OCEAN_Scores.csv')


24


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
import csv
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the VADER lexicon, then build the analyzer shared by compute_ocean_scores
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()


def compute_ocean_scores(text):
    """Turn the VADER 'pos'/'neg' values of a text into five trait scores.

    Args:
        text: Text passed to ``sid.polarity_scores``.

    Returns:
        Dict with keys 'openness', 'conscientiousness', 'extraversion',
        'agreeableness' and 'neuroticism', each rounded to one decimal.
    """
    polarity = sid.polarity_scores(text)
    pos = polarity['pos']
    neg = polarity['neg']

    def to_scale(raw):
        # Halve the raw combination, multiply by 10, keep one decimal
        return round((raw / 2) * 10, 1)

    # Openness and agreeableness use the same combination of pos and neg
    return {
        'openness': to_scale(pos + 1 - neg),
        'conscientiousness': to_scale(pos + neg),
        'extraversion': to_scale(pos + neg + 1),
        'agreeableness': to_scale(pos + 1 - neg),
        'neuroticism': to_scale(neg + 1 - pos),
    }

# Function to read data from a CSV file
def read_data_from_csv(mypersonality_final):
    """Collect the second column of every data row in a CSV file.

    Args:
        mypersonality_final: Path of the CSV file; its first row is skipped
            as the header.

    Returns:
        List of second-column strings in file order. Blank lines are skipped
        and an empty file yields ``[]``.

    Raises:
        IndexError: If a non-blank row has fewer than two columns.
    """
    # newline='' lets the csv module handle line endings inside quoted fields
    with open(mypersonality_final, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header row (tolerates an empty file)
        # The csv reader yields [] for blank lines; those carry no text
        return [row[1] for row in reader if row]

# Function to write OCEAN scores to a new CSV file
def write_output_to_csv(mypersonality_final, output_file):
    """Write a copy of a CSV file with five OCEAN score columns appended.

    Args:
        mypersonality_final: Path of the input CSV; the text to score is
            taken from the second column of each row.
        output_file: Path of the CSV file to create or overwrite.

    Blank input lines are skipped; an empty input file produces an empty
    output file.
    """
    trait_names = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']

    # newline='' on both files lets the csv module handle line endings itself
    with open(mypersonality_final, 'r', newline='') as input_csvfile, \
            open(output_file, 'w', newline='') as output_csvfile:
        reader = csv.reader(input_csvfile)
        writer = csv.writer(output_csvfile)

        header = next(reader, None)
        if header is None:
            return  # no header means there is nothing to copy or score

        # Write the header row with OCEAN score columns
        writer.writerow(header + trait_names)

        for row in reader:
            if not row:
                continue  # blank line: no text column to score
            ocean_scores = compute_ocean_scores(row[1])
            writer.writerow(row + [ocean_scores[name] for name in trait_names])

# Read data from the CSV file
dataset = read_data_from_csv('chi_squared.csv')

# Print the number of rows in the dataset
print(len(dataset))

# Write OCEAN scores to a new CSV file. write_output_to_csv scores every row
# itself, so no separate scoring pass is run here.
write_output_to_csv('chi_squared.csv', 'CS_OCEAN_Scores.csv')


24


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
import csv
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the VADER lexicon, then build the analyzer shared by compute_ocean_scores
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()


def compute_ocean_scores(text):
    """Map the VADER 'pos'/'neg' values of a text onto five trait scores.

    Args:
        text: Text passed to ``sid.polarity_scores``.

    Returns:
        Dict with keys 'openness', 'conscientiousness', 'extraversion',
        'agreeableness' and 'neuroticism', each rounded to one decimal.
    """
    polarity = sid.polarity_scores(text)
    pos = polarity['pos']
    neg = polarity['neg']

    def to_scale(raw):
        # Halve the raw combination, multiply by 10, keep one decimal
        return round((raw / 2) * 10, 1)

    # Openness and agreeableness use the same combination of pos and neg
    return {
        'openness': to_scale(pos + 1 - neg),
        'conscientiousness': to_scale(pos + neg),
        'extraversion': to_scale(pos + neg + 1),
        'agreeableness': to_scale(pos + 1 - neg),
        'neuroticism': to_scale(neg + 1 - pos),
    }

# Function to read data from a CSV file
def read_data_from_csv(mypersonality_final):
    """Read the second column of every data row in a CSV file.

    Args:
        mypersonality_final: Path of the CSV file; its first row is skipped
            as the header.

    Returns:
        List of second-column strings in file order. Blank lines are skipped
        and an empty file yields ``[]``.

    Raises:
        IndexError: If a non-blank row has fewer than two columns.
    """
    # newline='' lets the csv module handle line endings inside quoted fields
    with open(mypersonality_final, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header row (tolerates an empty file)
        # The csv reader yields [] for blank lines; those carry no text
        return [row[1] for row in reader if row]

# Function to write OCEAN scores to a new CSV file
def write_output_to_csv(mypersonality_final, output_file):
    """Copy a CSV file row by row, adding five OCEAN score columns.

    Args:
        mypersonality_final: Path of the input CSV; the text to score is
            taken from the second column of each row.
        output_file: Path of the CSV file to create or overwrite.

    Blank input lines are skipped; an empty input file produces an empty
    output file.
    """
    trait_names = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']

    # newline='' on both files lets the csv module handle line endings itself
    with open(mypersonality_final, 'r', newline='') as input_csvfile, \
            open(output_file, 'w', newline='') as output_csvfile:
        reader = csv.reader(input_csvfile)
        writer = csv.writer(output_csvfile)

        header = next(reader, None)
        if header is None:
            return  # no header means there is nothing to copy or score

        # Write the header row with OCEAN score columns
        writer.writerow(header + trait_names)

        for row in reader:
            if not row:
                continue  # blank line: no text column to score
            ocean_scores = compute_ocean_scores(row[1])
            writer.writerow(row + [ocean_scores[name] for name in trait_names])

# Read data from the CSV file
dataset = read_data_from_csv('cfs.csv')

# Print the number of rows in the dataset
print(len(dataset))

# Write OCEAN scores to a new CSV file. write_output_to_csv scores every row
# itself, so no separate scoring pass is run here.
write_output_to_csv('cfs.csv', 'CFS_OCEAN_Scores.csv')


24


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
import csv
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the VADER lexicon, then build the analyzer shared by compute_ocean_scores
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()


def compute_ocean_scores(text):
    """Compute five trait scores from the VADER 'pos'/'neg' values of a text.

    Args:
        text: Text passed to ``sid.polarity_scores``.

    Returns:
        Dict with keys 'openness', 'conscientiousness', 'extraversion',
        'agreeableness' and 'neuroticism', each rounded to one decimal.
    """
    polarity = sid.polarity_scores(text)
    pos = polarity['pos']
    neg = polarity['neg']

    def to_scale(raw):
        # Halve the raw combination, multiply by 10, keep one decimal
        return round((raw / 2) * 10, 1)

    # Openness and agreeableness use the same combination of pos and neg
    return {
        'openness': to_scale(pos + 1 - neg),
        'conscientiousness': to_scale(pos + neg),
        'extraversion': to_scale(pos + neg + 1),
        'agreeableness': to_scale(pos + 1 - neg),
        'neuroticism': to_scale(neg + 1 - pos),
    }

# Function to read data from a CSV file
def read_data_from_csv(mypersonality_final):
    """Extract the second column of every data row in a CSV file.

    Args:
        mypersonality_final: Path of the CSV file; its first row is skipped
            as the header.

    Returns:
        List of second-column strings in file order. Blank lines are skipped
        and an empty file yields ``[]``.

    Raises:
        IndexError: If a non-blank row has fewer than two columns.
    """
    # newline='' lets the csv module handle line endings inside quoted fields
    with open(mypersonality_final, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header row (tolerates an empty file)
        # The csv reader yields [] for blank lines; those carry no text
        return [row[1] for row in reader if row]

# Function to write OCEAN scores to a new CSV file
def write_output_to_csv(mypersonality_final, output_file):
    """Rewrite a CSV file to a new path with five OCEAN score columns added.

    Args:
        mypersonality_final: Path of the input CSV; the text to score is
            taken from the second column of each row.
        output_file: Path of the CSV file to create or overwrite.

    Blank input lines are skipped; an empty input file produces an empty
    output file.
    """
    trait_names = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']

    # newline='' on both files lets the csv module handle line endings itself
    with open(mypersonality_final, 'r', newline='') as input_csvfile, \
            open(output_file, 'w', newline='') as output_csvfile:
        reader = csv.reader(input_csvfile)
        writer = csv.writer(output_csvfile)

        header = next(reader, None)
        if header is None:
            return  # no header means there is nothing to copy or score

        # Write the header row with OCEAN score columns
        writer.writerow(header + trait_names)

        for row in reader:
            if not row:
                continue  # blank line: no text column to score
            ocean_scores = compute_ocean_scores(row[1])
            writer.writerow(row + [ocean_scores[name] for name in trait_names])

# Read data from the CSV file
dataset = read_data_from_csv('pcc_features.csv')

# Print the number of rows in the dataset
print(len(dataset))

# Write OCEAN scores to a new CSV file. write_output_to_csv scores every row
# itself, so no separate scoring pass is run here.
write_output_to_csv('pcc_features.csv', 'PCC_OCEAN_Scores.csv')


24


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


#PERSONALITY prediction

In this step, we train a model on the labelled personality scores and then apply it to the features selected by the feature selection algorithms.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Labelled rows used to fit the classifier ('personality' is the target column)
data = pd.read_csv('train.csv')

# Rows scored from the information-gain features; these receive predicted labels
daata = pd.read_csv('IG_OCEAN_Scores.csv')

# The same five score columns serve as features for fitting and predicting
trait_columns = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
X_train = data[trait_columns]
y_train = data['personality']

# NOTE: despite its name, y_test holds the feature columns to predict on
y_test = daata[trait_columns]

# Polynomial-kernel SVM; fit() returns the estimator itself
clf = SVC(kernel='poly').fit(X_train, y_train)

# Predict a label for every scored row and store it alongside the scores
y_pred = clf.predict(y_test)
daata['personality'] = y_pred

# The scores file is overwritten in place, now including the 'personality' column
daata.to_csv('IG_OCEAN_Scores.csv', index=False)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Labelled rows used to fit the classifier ('personality' is the target column)
data = pd.read_csv('train.csv')

# Rows scored from the chi-squared features; these receive predicted labels
daata = pd.read_csv('CS_OCEAN_Scores.csv')

# The same five score columns serve as features for fitting and predicting
trait_columns = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
X_train = data[trait_columns]
y_train = data['personality']

# NOTE: despite its name, y_test holds the feature columns to predict on
y_test = daata[trait_columns]

# RBF-kernel SVM; fit() returns the estimator itself.
# NOTE(review): the sibling prediction cells use kernel='poly' — confirm the difference is intended.
clf = SVC(kernel='rbf').fit(X_train, y_train)

# Predict a label for every scored row and store it alongside the scores
y_pred = clf.predict(y_test)
daata['personality'] = y_pred

# The scores file is overwritten in place, now including the 'personality' column
daata.to_csv('CS_OCEAN_Scores.csv', index=False)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Labelled rows used to fit the classifier ('personality' is the target column)
data = pd.read_csv('train.csv')

# Rows scored from the CFS features; these receive predicted labels
daata = pd.read_csv('CFS_OCEAN_Scores.csv')

# The same five score columns serve as features for fitting and predicting
trait_columns = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
X_train = data[trait_columns]
y_train = data['personality']

# NOTE: despite its name, y_test holds the feature columns to predict on
y_test = daata[trait_columns]

# Polynomial-kernel SVM; fit() returns the estimator itself
clf = SVC(kernel='poly').fit(X_train, y_train)

# Predict a label for every scored row and store it alongside the scores
y_pred = clf.predict(y_test)
daata['personality'] = y_pred

# The scores file is overwritten in place, now including the 'personality' column
daata.to_csv('CFS_OCEAN_Scores.csv', index=False)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Labelled rows used to fit the classifier ('personality' is the target column)
data = pd.read_csv('train.csv')

# Rows scored from the PCC features; these receive predicted labels
daata = pd.read_csv('PCC_OCEAN_Scores.csv')

# The same five score columns serve as features for fitting and predicting
trait_columns = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
X_train = data[trait_columns]
y_train = data['personality']

# NOTE: despite its name, y_test holds the feature columns to predict on
y_test = daata[trait_columns]

# Polynomial-kernel SVM; fit() returns the estimator itself
clf = SVC(kernel='poly').fit(X_train, y_train)

# Predict a label for every scored row and store it alongside the scores
y_pred = clf.predict(y_test)
daata['personality'] = y_pred

# The scores file is overwritten in place, now including the 'personality' column
daata.to_csv('PCC_OCEAN_Scores.csv', index=False)


#Classifiers

In this stage, we train and evaluate machine learning classifiers on the outputs of the feature selection algorithms.

pcc-svm

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# OCEAN scores derived from the PCC features, with a 'personality' column.
# NOTE(review): that column appears to be written by the SVM prediction cell
# earlier in this notebook, so the accuracy below presumably measures
# agreement with that model rather than with ground truth — confirm.
data = pd.read_csv('PCC_OCEAN_Scores.csv')

# Features are the five score columns; the target is the personality label
feature_columns = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
X = data[feature_columns]
y = data['personality']

# Hold out half of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Fit an RBF-kernel SVM on the training half
clf = SVC(kernel='rbf')
clf.fit(X_train, y_train)

# Score the held-out half
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy_percentage = accuracy * 100

# Report accuracy as a percentage with two decimals
print("Accuracy: {:.2f}%".format(accuracy_percentage))


Accuracy: 83.33%


CHI-SQUARED TEST --Decision Tree


In [None]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

  # Load the dataset
# OCEAN scores derived from the chi-squared features, with a 'personality' column.
# NOTE(review): that column appears to be written by the SVM prediction cell
# earlier in this notebook, so the accuracy below presumably measures
# agreement with that model rather than with ground truth — confirm.
data = pd.read_csv('CS_OCEAN_Scores.csv')
X = data[['openness','conscientiousness','extraversion','agreeableness','neuroticism']]
y = data['personality']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.45,random_state=42)

# Train the decision tree classifier. A fixed random_state makes the tree,
# and therefore the reported accuracy, repeatable between runs (matching the
# Random Forest cell, which also seeds its classifier).
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)

# Test the classifier on the test set
y_pred = clf.predict(X_test)

# Print the accuracy of the classifier
accuracy1 = accuracy_score(y_test, y_pred)
accuracy1 = accuracy1 * 100
print("Accuracy: {:.2f}%".format(accuracy1))

Accuracy: 81.82%


CFS---Naive Bayes

In [None]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# OCEAN scores derived from the CFS features, with a 'personality' column.
# NOTE(review): that column appears to be written by the SVM prediction cell
# earlier in this notebook, so the accuracy below presumably measures
# agreement with that model rather than with ground truth — confirm.
data = pd.read_csv('CFS_OCEAN_Scores.csv')

# Features are the five score columns; the target is the personality label
feature_columns = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
X = data[feature_columns]
y = data['personality']

# Hold out 45% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.45, random_state=42)

# Fit a Multinomial Naive Bayes model on the training portion
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Score the held-out portion
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy_percentage = accuracy * 100

# Report accuracy as a percentage with two decimals
print("Naive Bayes Accuracy (CFS): {:.2f}%".format(accuracy_percentage))


Naive Basis Accuracy CFS: 72.73%


IG---Random Forest

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# OCEAN scores derived from the information-gain features, with a 'personality' column.
# NOTE(review): that column appears to be written by the SVM prediction cell
# earlier in this notebook, so the accuracy below presumably measures
# agreement with that model rather than with ground truth — confirm.
data = pd.read_csv('IG_OCEAN_Scores.csv')

# Features are the five score columns; the target is the personality label
feature_columns = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
X = data[feature_columns]
y = data['personality']

# Hold out 57% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.57, random_state=42)

# Fit a seeded Random Forest on the training portion
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Score the held-out portion
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy_percentage = accuracy * 100

# Report accuracy as a percentage with two decimals
print("Random Forest Accuracy: {:.2f}%".format(accuracy_percentage))


Random Forest Accuracy: 92.86%
