In [1]:
import os

#data process
import pandas as pd
import numpy as np

#data import
import zipfile  
import shutil
import pandas as pd
#data cleansing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import ssl
import string
from textblob import TextBlob

In [2]:
# Create data directory if it doesn't exist
data_dir = 'data'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

# Run the Kaggle API command to download the dataset
!kaggle datasets download -d thoughtvector/customer-support-on-twitter -p {data_dir}

# Unzip the downloaded dataset to the data directory
zip_file_path = os.path.join(data_dir, 'customer-support-on-twitter.zip')
extracted_folder_path = os.path.join(data_dir, 'twcs')

# Extract the contents directly into the data folder
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(data_dir)

# Move the contents of the 'twcs' folder to the 'data' folder
for item in os.listdir(extracted_folder_path):
    s = os.path.join(extracted_folder_path, item)
    d = os.path.join(data_dir, item)
    shutil.move(s, d)

# Remove the now-empty 'twcs' folder
os.rmdir(extracted_folder_path)

# Display success message
print("Dataset downloaded and moved to the data folder successfully!")


customer-support-on-twitter.zip: Skipping, found more recently modified local copy (use --force to force download)
Dataset downloaded and moved to the data folder successfully!


In [3]:
# Specify the size of the random sample
sample_size = 1000  # Adjust the sample size as needed
random_seed = 42  # Set a random seed for reproducibility

# Load the full dataset; the random sample is drawn from it below
dataset_path = 'data/twcs.csv'
df_full = pd.read_csv(dataset_path)

# Take a random sample of the dataset.
# The requested size is clamped to the number of available rows because
# np.random.choice(..., replace=False) raises ValueError when asked for more
# items than exist. For sample_size <= len(df_full) the draw is unchanged.
np.random.seed(random_seed)
random_indices = np.random.choice(
    df_full.index,
    size=min(sample_size, len(df_full)),
    replace=False,
)
df = df_full.loc[random_indices]

# Display the first few rows of the DataFrame to understand the structure
print("First few rows of the dataset:")
print(df.head())

# Save the first few rows to a sample CSV file
sample_path = 'data/sample_dataset.csv'
df.head().to_csv(sample_path, index=False)

# Display general information about the dataset.
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() would emit a stray "None" line.
print("\nDataset info:")
df.info()

# Display basic statistics of numerical columns
print("\nSummary statistics:")
print(df.describe())

# Display the number of rows and columns in the dataset
rows, cols = df.shape
print(f"\nNumber of rows: {rows}, Number of columns: {cols}")

First few rows of the dataset:
         tweet_id     author_id  inbound                      created_at  \
160535     192624        161253     True  Wed Oct 04 13:59:33 +0000 2017   
659248     738238        296574     True  Fri Oct 06 18:29:06 +0000 2017   
2250310   2414302  AppleSupport    False  Tue Nov 14 17:38:01 +0000 2017   
1640680   1793929        539096     True  Thu Oct 12 06:04:41 +0000 2017   
1933623   2088018        617376     True  Mon Nov 06 20:30:49 +0000 2017   

                                                      text response_tweet_id  \
160535   @161252 What's that egg website people talk about            192623   
659248   Why!🤷🏻‍♀️ #iOS11 @AppleSupport https://t.co/BX...            738237   
2250310  @693975 We can assist you. We recommend updati...           2414303   
1640680  @331912 @115955 Thats better than having an un...           1793928   
1933623  @VirginAmerica is probably one of the best air...           2088017   

         in_response_to_tweet_i

In [4]:
# Display information about the DataFrame.
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; print(df.info()) would append a spurious "None" line.
print("DataFrame Information:")
df.info()

DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 160535 to 392446
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   tweet_id                 1000 non-null   int64  
 1   author_id                1000 non-null   object 
 2   inbound                  1000 non-null   bool   
 3   created_at               1000 non-null   object 
 4   text                     1000 non-null   object 
 5   response_tweet_id        643 non-null    object 
 6   in_response_to_tweet_id  720 non-null    float64
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 55.7+ KB
None


In [5]:
# Show the leading tweets as stored in the raw 'text' column (index hidden)
first_texts = df['text'].head()
print("\nOriginal Text:")
print(first_texts.to_string(index=False))


Original Text:
 @161252 What's that egg website people talk about
Why!🤷🏻‍♀️ #iOS11 @AppleSupport https://t.co/BXr...
@693975 We can assist you. We recommend updatin...
@331912 @115955 Thats better than having an uns...
@VirginAmerica is probably one of the best airl...


In [6]:
# Print an 80-character rule, with a blank line before and after it,
# to visually separate the output sections
separator_line = "=" * 80
print(f"\n{separator_line}\n")





In [7]:
# Register the project-local NLTK data folder on NLTK's search path.
# The location is derived from the relative 'data' directory used by the rest
# of the notebook instead of a machine-specific absolute path, so the notebook
# is not tied to one user's home directory.
nltk_data_dir = os.path.abspath(os.path.join('data', 'nltk_data'))

# Re-running the cell must not stack duplicate entries on the search path
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

In [8]:
# Function to preprocess text data (plus its private helpers)
_PUNCTUATION_CHARS = frozenset(string.punctuation)
_STOP_WORDS_BY_LANGUAGE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word set for `language`, loading the corpus only once."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def _is_punctuation(token):
    """Return True when `token` is non-empty and made only of ASCII punctuation."""
    return bool(token) and all(char in _PUNCTUATION_CHARS for char in token)


def preprocess_text(text):
    """Normalise one tweet for downstream analysis.

    Steps: lowercase, drop whitespace-separated words that start with
    'http' (links), tokenize with NLTK's word_tokenize, then drop English
    stop words and tokens consisting solely of punctuation.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. NaN from a missing cell)
        is treated as empty.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' when nothing is left).
    """
    # Missing values reach .apply() as float NaN; there is nothing to process
    if not isinstance(text, str):
        return ''

    # Lowercase first so the link filter also catches 'HTTP://...' / 'Https://...'
    words = [word for word in text.lower().split() if not word.startswith('http')]

    # Tokenize the text into individual words
    tokens = word_tokenize(' '.join(words))

    # Remove stop words and punctuation. Punctuation is filtered after
    # tokenization because the tokenizer splits it off words ("why!" -> "why", "!");
    # the per-character check replaces a substring test against string.punctuation
    # that only matched free-standing runs such as "!" or "./".
    stop_words = _get_stop_words()
    kept = [
        token for token in tokens
        if token not in stop_words and not _is_punctuation(token)
    ]

    return ' '.join(kept)

# Clean every tweet in the 'text' column and store the result in a new
# 'preprocessed_text' column
raw_text = df['text']
df['preprocessed_text'] = raw_text.apply(preprocess_text)

In [10]:
# Function to get sentiment polarity
def get_sentiment(text):
    """Return the TextBlob sentiment polarity computed for `text`."""
    return TextBlob(text).sentiment.polarity

# Score every cleaned tweet and store the polarity in a new
# 'sentiment_polarity' column
cleaned_text = df['preprocessed_text']
df['sentiment_polarity'] = cleaned_text.apply(get_sentiment)


In [11]:
# Display information about the DataFrame, now including the derived columns.
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; print(df.info()) would append a spurious "None" line.
print("DataFrame Information:")
df.info()

DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 160535 to 392446
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   tweet_id                 1000 non-null   int64  
 1   author_id                1000 non-null   object 
 2   inbound                  1000 non-null   bool   
 3   created_at               1000 non-null   object 
 4   text                     1000 non-null   object 
 5   response_tweet_id        643 non-null    object 
 6   in_response_to_tweet_id  720 non-null    float64
 7   preprocessed_text        1000 non-null   object 
 8   sentiment_polarity       1000 non-null   float64
dtypes: bool(1), float64(2), int64(1), object(5)
memory usage: 71.3+ KB
None
