In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("Data/twcs.csv")

# Display the shape and the first few rows
print("Shape of dataset:", df.shape)
df.head()


Shape of dataset: (2811774, 7)


Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [2]:
print("Column names:", df.columns.tolist())
print("\nData types:\n", df.dtypes)


Column names: ['tweet_id', 'author_id', 'inbound', 'created_at', 'text', 'response_tweet_id', 'in_response_to_tweet_id']

Data types:
 tweet_id                     int64
author_id                   object
inbound                       bool
created_at                  object
text                        object
response_tweet_id           object
in_response_to_tweet_id    float64
dtype: object


In [3]:
# Check for missing values
print(df.isnull().sum())

# See distribution of inbound vs outbound messages
print(df['inbound'].value_counts())

# Example inbound message (customer)
print("\nCustomer message:\n", df[df['inbound'] == True]['text'].iloc[0])

# Example outbound message (support response)
print("\nSupport message:\n", df[df['inbound'] == False]['text'].iloc[0])


tweet_id                         0
author_id                        0
inbound                          0
created_at                       0
text                             0
response_tweet_id          1040629
in_response_to_tweet_id     794335
dtype: int64
inbound
True     1537843
False    1273931
Name: count, dtype: int64

Customer message:
 @sprintcare and how do you propose we do that

Support message:
 @115712 I understand. I would like to assist you. We would need to get you into a private secured link to further assist.


In [4]:
# Sort by author and time to keep conversation order
df_sorted = df.sort_values(by=['author_id', 'created_at'])


In [5]:
# Store pairs here
qa_pairs = []

# Flag to hold current customer message
current_question = None

for _, row in df_sorted.iterrows():
    if row['inbound']:  # It's a customer message
        current_question = row['text']
    elif not row['inbound'] and current_question:  # It's a support message after customer
        answer = row['text']
        qa_pairs.append((current_question, answer))
        current_question = None  # Reset after pairing


In [6]:
qa_df = pd.DataFrame(qa_pairs, columns=['question', 'answer'])
print("Total QA pairs:", len(qa_df))
qa_df.head()


Total QA pairs: 1


Unnamed: 0,question,answer
0,"@ATT ...and all these hours later, still no re...",@224279 @117070 @3146 @3730 Thanks for joining...


In [7]:
!pip install transformers datasets


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp38-cp38-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py38-none-any.whl.metadata (7.1 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
Downloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
Downloading multiprocess-0.70.16-py38-none-any.whl (132 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Downloading xxhash-3.5.0-cp38-cp38-win_amd64.whl (30 kB)
Installing co

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fbprophet 0.7.1 requires cmdstanpy==0.9.5, but you have cmdstanpy 1.2.5 which is incompatible.


In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
tokenizer.pad_token = tokenizer.eos_token

# Sample customer input
input_text = "@ATT ...and all these hours later, still no response from you or @117735 for that matter. This is service failure number two. Will there be time for a third?"

# Encode input and generate attention mask
inputs = tokenizer(input_text + tokenizer.eos_token, return_tensors='pt', padding=True)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Generate response with attention_mask
chat_history_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=1000,
    pad_token_id=tokenizer.eos_token_id
)

# Decode reply
reply = tokenizer.decode(chat_history_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
print(f"Customer: {input_text}\nSupport Bot: {reply}")


Customer: @ATT ...and all these hours later, still no response from you or @117735 for that matter. This is service failure number two. Will there be time for a third?
Support Bot: I'm sorry, I'm not sure what you mean by this.


In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load model and tokenizer
model_name = "microsoft/DialoGPT-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set pad token if needed
tokenizer.pad_token = tokenizer.eos_token

# Initialize conversation history
chat_history_ids = None

print("🤖 DialoGPT Customer Support Chatbot (type 'quit' to stop)")
while True:
    # User input
    user_input = input("Customer: ")

    if user_input.lower() in ['quit', 'exit']:
        print("👋 Goodbye!")
        break

    # Tokenize and encode user input, append to history if it exists
    new_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt')

    # If this is not the first message, concatenate with chat history
    bot_input_ids = torch.cat([chat_history_ids, new_input_ids], dim=-1) if chat_history_ids is not None else new_input_ids

    # Generate response
    chat_history_ids = model.generate(
        bot_input_ids,
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=50,
        top_p=0.95
    )

    # Decode and print response (only the new part)
    response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    print(f"Support Bot: {response}")


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

🤖 DialoGPT Customer Support Chatbot (type 'quit' to stop)
Support Bot: 
Support Bot: R
Support Bot: 
👋 Goodbye!
