In [7]:
import pandas as pd

# Read the raw table from disk; later cells work on this same `df`.
df = pd.read_csv('unfiltered_data.csv')

# Number of NaN cells in each column.
missing_values = df.isna().sum()
print("Missing Values Count:\n", missing_values)

# Missing utterances, counted separately for rows labelled 0 and rows labelled 1.
missing_label_0 = df.loc[df['label'] == 0, 'utterance'].isna().sum()
missing_label_1 = df.loc[df['label'] == 1, 'utterance'].isna().sum()

print("\nMissing Values with label = 0 (inv):", missing_label_0)
print("Missing Values with label = 1 (chi):", missing_label_1)

# Fill missing 'utterance' and 'pos_tags' cells with a placeholder chosen by the
# row's label: 'inv' for label 0, 'chi' for label 1.
# Labels other than 0/1 map to NaN, so missing cells on those rows are left
# untouched -- the same outcome as the earlier row-wise lambda.
# Series.map + fillna is vectorised; df.apply(..., axis=1) ran a Python lambda
# once per row for each of the two columns.
label_placeholder = df['label'].map({0: 'inv', 1: 'chi'})
df['utterance'] = df['utterance'].fillna(label_placeholder)
df['pos_tags'] = df['pos_tags'].fillna(label_placeholder)


Missing Values Count:
 utterance    438
label          0
pos_tags     438
dtype: int64

Missing Values with label = 0 (inv): 290
Missing Values with label = 1 (chi): 148


In [8]:
# Write the current DataFrame to CSV (row index omitted) and confirm on stdout.
file_name = 'filtered_data.csv'
df.to_csv(path_or_buf=file_name, index=False)
print("Data saved to " + file_name)


Data saved to filtered_data.csv


In [9]:
# Count missing (NaN) values per column across the whole DataFrame.
missing_data_summary = df.isna().sum()

# Pull the two columns of interest out of the per-column summary.
missing_utterances = missing_data_summary['utterance']
missing_labels = missing_data_summary['label']

print("Missing data summary:\n", missing_data_summary)
print("Missing utterances: {}".format(missing_utterances))
print("Missing labels: {}".format(missing_labels))


Missing data summary:
 utterance    0
label        0
pos_tags     0
dtype: int64
Missing utterances: 0
Missing labels: 0


In [30]:
import numpy as np
# Materialise the three columns as plain Python lists, in one unpacking step.
utterances, labels, pos_tags = (
    df[column_name].tolist()
    for column_name in ('utterance', 'label', 'pos_tags')
)

In [34]:
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
# Fetch the NLTK data packages this notebook relies on; nltk.download reports
# "already up-to-date" when a package is present locally.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Tokenization and POS tagging
def tokenize_and_tag(text):
    """Tokenize ``text`` with ``word_tokenize`` and POS-tag the tokens.

    Args:
        text: The utterance to process.

    Returns:
        Whatever ``pos_tag`` returns for the token list (for NLTK, a list of
        ``(token, tag)`` pairs).
    """
    return pos_tag(word_tokenize(text))

# Tag all utterances
def tag_all_utterances(utterances):
    """Apply ``tokenize_and_tag`` to every utterance.

    Args:
        utterances: Iterable of utterances to tag.

    Returns:
        A list with one ``tokenize_and_tag`` result per utterance, in input
        order.
    """
    return list(map(tokenize_and_tag, utterances))

# Extract features
def extract_features(tagged_sentence):
    """Summarise a tagged sentence as simple count features.

    Args:
        tagged_sentence: Sequence of ``(word, tag)`` pairs.

    Returns:
        dict with, in this order:
            'word_count'    -- number of pairs in the sentence,
            'noun_count'    -- tags starting with 'NN',
            'verb_count'    -- tags starting with 'VB',
            'pronoun_count' -- tags starting with 'PRP'.
    """
    prefix_to_key = (
        ('NN', 'noun_count'),
        ('VB', 'verb_count'),
        ('PRP', 'pronoun_count'),
    )
    features = {
        'word_count': len(tagged_sentence),
        'noun_count': 0,
        'verb_count': 0,
        'pronoun_count': 0,
    }
    # Single pass over the sentence; each tag is checked against every prefix.
    for _word, tag in tagged_sentence:
        for prefix, key in prefix_to_key:
            if tag.startswith(prefix):
                features[key] += 1
    return features



[nltk_data] Downloading package punkt to C:\Users\SRI
[nltk_data]     SHIKA.L\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\SRI SHIKA.L\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [37]:
def main():
    """Tag every utterance, compute per-utterance features, and print them.

    Reads the module-level ``utterances`` list, tags it with
    ``tag_all_utterances``, and applies ``extract_features`` to each tagged
    sentence. The resulting list of feature dicts is printed.
    """
    tagged_data = tag_all_utterances(utterances)
    # The notebook defines no `extract_all_features`; the previous call to it
    # only worked off leftover kernel state. Map the per-sentence extractor
    # directly so the cell runs on a fresh kernel.
    features = [extract_features(tagged_sentence) for tagged_sentence in tagged_data]
    print(features)

In [38]:
# Run the pipeline only when this code executes as the top-level program.
if __name__ == "__main__":
    main()

[{'word_count': 2, 'noun_count': 2, 'verb_count': 0, 'pronoun_count': 0}, {'word_count': 5, 'noun_count': 2, 'verb_count': 2, 'pronoun_count': 1}, {'word_count': 2, 'noun_count': 1, 'verb_count': 1, 'pronoun_count': 0}, {'word_count': 2, 'noun_count': 2, 'verb_count': 0, 'pronoun_count': 0}, {'word_count': 5, 'noun_count': 1, 'verb_count': 2, 'pronoun_count': 1}, {'word_count': 5, 'noun_count': 1, 'verb_count': 2, 'pronoun_count': 1}, {'word_count': 11, 'noun_count': 2, 'verb_count': 1, 'pronoun_count': 1}, {'word_count': 7, 'noun_count': 1, 'verb_count': 2, 'pronoun_count': 1}, {'word_count': 2, 'noun_count': 1, 'verb_count': 1, 'pronoun_count': 0}, {'word_count': 3, 'noun_count': 1, 'verb_count': 1, 'pronoun_count': 1}, {'word_count': 5, 'noun_count': 2, 'verb_count': 1, 'pronoun_count': 2}, {'word_count': 5, 'noun_count': 1, 'verb_count': 2, 'pronoun_count': 1}, {'word_count': 4, 'noun_count': 2, 'verb_count': 1, 'pronoun_count': 0}, {'word_count': 3, 'noun_count': 1, 'verb_count': 