# In [None]:  (notebook cell marker left over from export)
#############################################
# File: crypto_sentiment_analysis.py
#############################################

"""
Cryptocurrency Sentiment Analysis System
----------------------------------------
Demonstrates a pipeline to fetch or simulate cryptocurrency-related sentiments,
prepare the data, and train/evaluate an NLP model to predict sentiment labels.

Before proceeding, make sure to install the required packages:
    pip install transformers torch scikit-learn pandas numpy
"""

# 1. Install necessary packages if needed (uncomment if you're in an environment like Google Colab)
# !pip install transformers torch scikit-learn pandas

import dataclasses
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

##########################################
# Sample / Synthetic Dataset Creation
##########################################
# Hand-written (tweet, sentiment) pairs used to seed the demo CSV.
_seed_rows = [
    ("Bitcoin is soaring to new heights! #crypto", "Positive"),
    ("I'm uncertain about Ethereum's short-term trends.", "Neutral"),
    ("Solana's network issues worry me a lot.", "Negative"),
    ("DOGE is making a strong comeback!", "Positive"),
    ("I think Cardano is overhyped.", "Negative"),
    ("The market seems stable today.", "Neutral"),
    ("Totally disappointed by recent Polkadot performance.", "Negative"),
    ("Impressed by how fast Shiba Inu is growing!", "Positive"),
]

# Column-oriented view of the pairs: one list per CSV column, same row order.
data = {
    "tweet": [tweet for tweet, _ in _seed_rows],
    "sentiment": [sentiment for _, sentiment in _seed_rows],
}

crypto_file = "crypto_tweets.csv"

# Only seed the CSV when nothing exists at that path; an existing file is
# left untouched and will be what the loading step reads.
if not os.path.exists(crypto_file):
    df = pd.DataFrame.from_dict(data)
    df.to_csv(crypto_file, index=False)

# 2. Load dataset
# Read the CSV back and pull the text and label columns out as plain lists.
df_crypto = pd.read_csv(crypto_file)
X, y = (df_crypto[column].tolist() for column in ("tweet", "sentiment"))

# Label encoding
# String label -> integer class id. A label missing from this map raises
# KeyError during encoding.
label_map = dict(Negative=0, Neutral=1, Positive=2)
y_encoded = list(map(label_map.__getitem__, y))

# 3. Train-Test Split
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# 4. Tokenizer and Model Initialization
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Derive the head size and the label names from label_map instead of
# hard-coding 3, so the classifier always matches the encoding used for the
# targets and the saved config carries readable label names.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_map),
    id2label={class_id: name for name, class_id in label_map.items()},
    label2id=dict(label_map),
)

##########################################
# Custom Dataset Class
##########################################
class CryptoDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing raw texts with integer sentiment labels.

    Each item is tokenized on access (not up front) and returned as a dict
    with ``input_ids``, ``attention_mask`` and ``labels`` entries.

    Args:
        texts: Sequence of strings to classify.
        labels: Sequence of integer class ids, aligned with ``texts``.
        encoder: Optional callable invoked like a Hugging Face tokenizer
            (``encoder(text, truncation=..., padding=..., max_length=...,
            return_tensors=...)``). When omitted, the module-level
            ``tokenizer`` is looked up at item-access time.
        max_length: Length every example is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, encoder=None, max_length=64):
        # Fail at construction time rather than with an IndexError (or a
        # silently ignored tail of labels) in the middle of training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.encoder = encoder
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus the label."""
        # Resolve the global tokenizer lazily so the class can be defined
        # (and used with an explicit encoder) before the global exists.
        encode = tokenizer if self.encoder is None else self.encoder
        encoding = encode(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # The encoder returns a batch of one; drop only that leading batch
        # dimension. A bare squeeze() would also collapse the sequence
        # dimension when max_length == 1.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

train_dataset = CryptoDataset(X_train, y_train)
test_dataset = CryptoDataset(X_test, y_test)

# 5. Training Arguments
# Newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy` and deprecated the old keyword. Use whichever name the
# installed TrainingArguments actually defines so the script runs on both.
_eval_strategy_key = (
    "eval_strategy"
    if any(f.name == "eval_strategy" for f in dataclasses.fields(TrainingArguments))
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="crypto_sentiment_model",
    num_train_epochs=1,  # For demonstration, set low; in production use more epochs
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_steps=10,
    save_strategy="epoch",
    **{_eval_strategy_key: "epoch"},  # evaluate once per epoch
)

##########################################
# Trainer
##########################################
# Gather everything the Trainer needs in one place: the model to fine-tune,
# the run configuration, and the train / evaluation datasets.
_trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
}
trainer = Trainer(**_trainer_kwargs)

# 6. Train the Model
# Runs the fine-tuning loop; the returned training summary is not used.
trainer.train()

# 7. Evaluate on Test Set
predictions = trainer.predict(test_dataset)
# Highest-scoring class per example.
pred_labels = np.argmax(predictions.predictions, axis=1)

print("\nClassification Report (Test Set):")
# The held-out set is tiny, so some classes may be absent from both y_test
# and the predictions. Without an explicit `labels`, sklearn infers the class
# list from the data and raises ValueError when it does not match the three
# target names. zero_division=0 reports 0 for classes with no samples
# without emitting warnings.
print(
    classification_report(
        y_test,
        pred_labels,
        labels=list(label_map.values()),
        target_names=list(label_map.keys()),
        zero_division=0,
    )
)

##########################################
# Inference on New Texts
##########################################
new_texts = [
    "I am super bullish on Bitcoin!",
    "Not sure if this crypto dip will last.",
    "Frustrated with the market crash!"
]

# Make sure dropout is disabled for inference regardless of what ran before.
model.eval()

# Trainer may have moved the model to an accelerator; the inputs must live on
# the same device as the model or the forward pass raises a device-mismatch
# RuntimeError.
model_inputs = tokenizer(
    new_texts,
    truncation=True,
    padding='max_length',
    max_length=64,
    return_tensors='pt',
).to(model.device)

with torch.no_grad():
    outputs = model(**model_inputs)
logits = outputs.logits
predictions_new = torch.argmax(logits, dim=-1).cpu().numpy()

# Integer class id -> human-readable label.
inv_label_map = {v: k for k, v in label_map.items()}
for text, pred_idx in zip(new_texts, predictions_new):
    # Cast the numpy integer to a plain int for the dict lookup.
    print(f"Text: {text}\nPredicted Sentiment: {inv_label_map[int(pred_idx)]}\n")

# Created/Modified files during execution:
print(crypto_file)
# The Trainer also writes its per-epoch checkpoints (save_strategy="epoch")
# under the configured output directory, so list that as well.
print(training_args.output_dir)