In [1]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import umap.umap_ as umap

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, OPTICS
from sklearn.mixture import GaussianMixture

from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, f1_score, silhouette_score
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [122]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, matthews_corrcoef, cohen_kappa_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from scipy.stats import uniform, loguniform
from sklearn.naive_bayes import MultinomialNB

In [2]:
# For handling warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\utgoy\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\utgoy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\utgoy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utgoy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Download the data

In [5]:
train_data = pd.read_csv('./data/train.csv')#.dropna()
val_data = pd.read_csv('./data/val.csv')#.dropna()
test_data = pd.read_csv('./data/test.csv')#.dropna()

In [6]:
# get all train data (labelled and unlabelled)
X_train    = train_data['Phrase']
y_train    = train_data['Sentiment']

# get only labelled train data
mask = (y_train != -100)
train_data_clean    = train_data[mask]
X_train_clean    = X_train[mask]
y_train_clean    = y_train[mask]

# get val data
X_val    = val_data['Phrase']
y_val    = val_data['Sentiment']

# get test data
X_test     = test_data['Phrase']

print(f"Train Data Shape: {X_train.shape}")
print(f"Cleaned Train Data Shape: {train_data_clean['Phrase'].shape}")
print(f"Validation Data Shape: {X_val.shape}")
print(f"Test Data Shape: {X_test.shape}")

print(" ")
print(f"Number of labels = 0 in train dataset as percentage: {((y_train == 0).sum() / (X_train.shape[0])) * 100:0.2f}%")
print(f"Number of labels = 1 in train dataset as percentage: {((y_train == 1).sum() / (X_train.shape[0])) * 100:0.2f}%")
print(f"Number of labels = 2 in train dataset as percentage: {((y_train == 2).sum() / (X_train.shape[0])) * 100:0.2f}%")
print(f"Number of labels = 3 in train dataset as percentage: {((y_train == 3).sum() / (X_train.shape[0])) * 100:0.2f}%")
print(f"Number of labels = 4 in train dataset as percentage: {((y_train == 4).sum() / (X_train.shape[0])) * 100:0.2f}%")
print(f"Number of labels = -100 in train dataset as percentage: {((y_train == -100).sum() / (X_train.shape[0])) * 100:0.2f}%")

print(" ")
print(f"Number of labels = 0 in val dataset as percentage: {((y_val == 0).sum() / (X_val.shape[0])) * 100:0.2f}%")
print(f"Number of labels = 1 in val dataset as percentage: {((y_val == 1).sum() / (X_val.shape[0])) * 100:0.2f}%")
print(f"Number of labels = 2 in val dataset as percentage: {((y_val == 2).sum() / (X_val.shape[0])) * 100:0.2f}%")
print(f"Number of labels = 3 in val dataset as percentage: {((y_val == 3).sum() / (X_val.shape[0])) * 100:0.2f}%")
print(f"Number of labels = 4 in val dataset as percentage: {((y_val == 4).sum() / (X_val.shape[0])) * 100:0.2f}%")
print(f"Number of labels = -100 in val dataset as percentage: {((y_val == -100).sum() / (X_val.shape[0])) * 100:0.2f}%")

Train Data Shape: (59706,)
Cleaned Train Data Shape: (24758,)
Validation Data Shape: (23256,)
Test Data Shape: (23257,)
 
Number of labels = 0 in train dataset as percentage: 8.33%
Number of labels = 1 in train dataset as percentage: 8.95%
Number of labels = 2 in train dataset as percentage: 5.33%
Number of labels = 3 in train dataset as percentage: 9.60%
Number of labels = 4 in train dataset as percentage: 9.26%
Number of labels = -100 in train dataset as percentage: 58.53%
 
Number of labels = 0 in val dataset as percentage: 19.63%
Number of labels = 1 in val dataset as percentage: 20.27%
Number of labels = 2 in val dataset as percentage: 20.42%
Number of labels = 3 in val dataset as percentage: 19.81%
Number of labels = 4 in val dataset as percentage: 19.88%
Number of labels = -100 in val dataset as percentage: 0.00%


# Vectorization

## Define Preprocessing Helper Functions

In [9]:
def clean(text):
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"",texter)
    texter = re.sub('&#39;', "\"", texter)
    texter = re.sub('\n', " ", texter)
    texter = re.sub(' u '," you ", texter)
    texter = re.sub('`',"", texter)
    texter = re.sub(' +', ' ', texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub('\r', ' ',texter)
    #added substitutions

    #***********added substitutions***********
    # remove all the special characters
    texter = re.sub(r'\W', ' ', texter)
    # remove all single characters
    texter = re.sub(r'\s+[a-zA-Z]\s+', ' ', texter)
    # Remove single characters from the start
    texter = re.sub(r'\^[a-zA-Z]\s+', ' ', texter)
    # Remove numbers
    texter = re.sub(r'\d+', ' ', texter)
    # Converting to Lowercase
    texter = texter.lower()
    # Remove punctuation
    texter = re.sub(r'[^\w\s]', ' ', texter)
    # Remove parentheses
    texter = re.sub(r'\([^)]*\)', ' ', texter)
    # Remove single quotes
    texter = re.sub(r'\'', ' ', texter)
    # Substituting multiple spaces with single space
    texter = re.sub(r'\s+', ' ', texter, flags=re.I)

    clean = re.compile('<.*?>')
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    texter = re.sub(clean, '', texter)
    if texter == "":
        texter = ""
    return texter

def clean_dataset(dataset):
    for row in range(dataset.shape[0]):
        dataset[row,0] = clean(dataset[row,0])
    return dataset

def tokenize_lexicon(texts):
    return_texts = []
    for i in range(len(texts)):
        return_texts.append(nltk.word_tokenize(texts[i]))
        return_texts[i] = nltk.pos_tag(return_texts[i])
    return return_texts

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wn.ADJ
    elif pos_tag.startswith('V'):
        return wn.VERB
    elif pos_tag.startswith('N'):
        return wn.NOUN
    elif pos_tag.startswith('R'):
        return wn.ADV
    else:
        return wn.NOUN

def lemmatize_texts(texts):
    return_texts = []
    lemmer = nltk.stem.WordNetLemmatizer()
    for i in range(len(texts)):
        return_texts.append([])
        for j in range(len(texts[i])):
                return_texts[i].append(lemmer.lemmatize(texts[i][j][0], pos=get_wordnet_pos(texts[i][j][1])))
    return return_texts

def stem_texts(texts):
    return_texts = []
    ps = PorterStemmer()
    for i in range(len(texts)):
        return_texts.append([])
        for j in range(len(texts[i])):
                return_texts[i].append(ps.stem(texts[i][j][0]))
    return return_texts


def backtostring(texts):
    return_texts = []
    for i in range(len(texts)):
        return_texts.append(" ".join(texts[i]))
    return return_texts

In [10]:
def pre_process(data):
    preproc_data = data.copy()
    preproc_data = preproc_data.str.lower()
    punctuation = string.punctuation
    mapping = str.maketrans("", "", punctuation)
    preproc_data = preproc_data.str.translate(mapping)
    stop_words = set(stopwords.words('english'))
    preproc_data = preproc_data.apply(lambda text: ' '.join([word for word in str(text).split() if word.lower() not in stop_words]))
    # nltk.download('wordnet')
    # lemmatizer = WordNetLemmatizer()
    # preproc_data = preproc_data.apply(lambda text: ' '.join([lemmatizer.lemmatize(word) for word in text.split()]))

    stemmer = PorterStemmer()
    preproc_data = preproc_data.apply(lambda text: ' '.join([stemmer.stem(word) for word in text.split()]))
    
    preproc_data = preproc_data.apply(lambda text: re.sub(r'@\w+', '', re.sub(r'http\S+|www\S+', '', text)))
    return preproc_data

# get the preprocessed data
X_train_preproc   = pre_process(X_train)
X_train_clean_preproc   = pre_process(X_train_clean)
X_val_preproc = pre_process(X_val)
X_test_preproc = pre_process(X_test)

## TF-IDF

In [173]:
porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [175]:
tfidf = TfidfVectorizer(strip_accents='unicode', lowercase=True, tokenizer=tokenizer_porter, stop_words='english')
X_train_preproc_tfidf = tfidf.fit_transform(X_train_preproc)
print(f"\nTF-IDF feature matrix shape: {X_train_preproc_tfidf.shape}")


TF-IDF feature matrix shape: (59706, 10456)


In [176]:
X_val_preproc_tfidf = tfidf.transform(X_val_preproc)
print(f"\nTF-IDF feature matrix shape: {X_val_preproc_tfidf.shape}")


TF-IDF feature matrix shape: (23256, 10456)


# Dimensionality Reduction

## PCA

In [None]:
pca = PCA(n_components=2400, random_state=42)  

X_pca = pca.fit_transform(X_train_preproc_tfidf.toarray())
print(f"PCA-reduced feature matrix shape: {X_pca.shape}")

In [None]:
sample_size = int(59701/5)
k_values = [5, 50, 100, 250, 500, 750, 1000]  # Start from 2, as silhouette score needs at least two clusters
bic_scores_pca = []
aic_scores_pca = []

# bic_scores_umap = []
# aic_scores_umap = []

# silhouette_scores_gmm = []

for k in tqdm(k_values, desc="Finding K"):

    sample_indices = np.random.choice(X_pca.shape[0], size=sample_size, replace=False)
    
    # Fit a Gaussian Mixture Model (GMM) with k components
    gmm = GaussianMixture(n_components=k, covariance_type='diag', max_iter=100, reg_covar=1e-4, init_params='kmeans', random_state=42)
    gmm_labels = gmm.fit_predict(X_pca[sample_indices])

    # Calculate BIC and AIC
    bic_scores_pca.append(gmm.bic(X_pca))
    aic_scores_pca.append(gmm.aic(X_pca))

    # gmm = GaussianMixture(n_components=k, covariance_type='diag', max_iter=100, reg_covar=1e-4, init_params='kmeans', random_state=42)
    # gmm_labels = gmm.fit_predict(X_umap[sample_indices])
    # # Calculate BIC and AIC
    # bic_scores_umap.append(gmm.bic(X_umap))
    # aic_scores_umap.append(gmm.aic(X_umap))

    # print(bic_scores_umap)
    # print(aic_scores_umap)

    # Calculate Silhouette score if k > 1 (Silhouette requires at least 2 clusters)
    # score = silhouette_score(X_pca, gmm_labels)
    # silhouette_scores_gmm.append(score)

# bic_scores_pca = [-10046903.765079908, -11768516.826057164, -12254994.691571359, -12697607.571135331, -12841542.659642098, -12771479.471291933, -12629629.958835967]
# aic_scores_pca = [-10051438.305520937, -11813943.204403894, -12345856.445368867, -12924775.451285176, -13295887.417045837, -13453001.105949566, -13538328.470747493]

# bic_scores_umap = [-679675.1635472373, -5523632.20674975, -7242640.424716241, -9202283.198420955, -10315246.580855194, -11068146.122167017, -11940543.615146823]
# aic_scores_umap = [-684209.7039882655, -5569058.585096479, -7333502.178513749, -9429451.0785708, -10769591.338258933, -11749667.75682465, -12849242.12705835]

# Plot the metrics for GMM
plt.figure(figsize=(14, 5))

# Plot BIC and AIC scores
# plt.subplot(1, 2, 1)
# plt.plot(k_values, bic_scores_umap, marker='o', label='BIC-UMAP')
# plt.plot(k_values, aic_scores_umap, marker='o',  label='AIC-UMAP')
# plt.xticks(k_values)
# plt.xlabel('Number of Components (k)')
# plt.ylabel('Score')
# plt.title('UMAP Scores')
# plt.legend()

# Plot the Silhouette Scores for GMM
plt.subplot(1, 2, 1)
plt.plot(k_values, bic_scores_pca, marker='o', label='BIC-PCA')
plt.plot(k_values, aic_scores_pca, marker='o',  label='AIC-PCA')
plt.xticks(k_values)
plt.xlabel('Number of Components (k)')
plt.ylabel('Score')
plt.title('PCA Scores')
plt.legend()

# Display the plots
plt.tight_layout()
plt.show()

# Regression

In [None]:
def assign_labels_with_clustering(df, X, clustering_model, model_name):
    """
    Assign labels to missing data based on clustering.

    Parameters:
    - df: pandas DataFrame with 'Sentiment' column
    - X: Feature matrix after PCA
    - clustering_model: sklearn clustering model
    - model_name: String name of the clustering model

    Returns:
    - df_combined: DataFrame with assigned labels
    """
    print(f"\n--- {model_name} Clustering ---")
    # Fit the clustering model
    clustering_model.fit(X)
    
    # Obtain cluster labels
    if hasattr(clustering_model, 'labels_'):
        clusters = clustering_model.labels_
    else:
        clusters = clustering_model.predict(X)
    
    df['Cluster'] = clusters
   
    # Debug: Show cluster distribution
    print(f"Number of clusters formed: {len(np.unique(clusters))}")
    # print(f"Cluster distribution:\n{pd.Series(clusters).value_counts()}")
    
    # Map each cluster to the most frequent sentiment label within that cluster
    cluster_label_map = {}
    for cluster in np.unique(clusters):
        # Select rows in the current cluster with known labels
        mask = (df['Cluster'] == cluster) & (df['Sentiment'].notnull())
        if mask.sum() == 0:
            # If no labeled data in cluster, assign the most common overall label
            most_common_label = df['Sentiment'].mode()[0]
            cluster_label_map[cluster] = most_common_label
            print(f"Cluster {cluster}: No labeled data. Assigning most common label {most_common_label}.")
        else:
            # Assign the most common label within the cluster           
            most_common_label = df.loc[mask, 'Sentiment'].mode()[0]
            cluster_label_map[cluster] = most_common_label
            print(f"Cluster {cluster}: Assigning label {most_common_label} based on majority voting.")
    
    # Assign labels to missing data
    def assign_label(row):
        if pd.isnull(row['Sentiment']):
            return cluster_label_map[row['Cluster']]
        else:
            return row['Sentiment']
    
    df['Sentiment_Assigned'] = df.apply(assign_label, axis=1)
    
    # Display the mapping
    # print(f"\nCluster to Label Mapping for {model_name}:")
    # for cluster, label in cluster_label_map.items():
    #     print(f"Cluster {cluster}: Label {label}")
    
    # Debug: Check the number of assigned labels
    print("\nLabel Distribution After Assignment:")
    print(df['Sentiment_Assigned'].value_counts())
    
    # Drop the 'Cluster' column as it's no longer needed
    # df = df.drop('Cluster', axis=1)
    return df

## PCA-GMM

In [None]:
gmm = GaussianMixture(n_components=100, random_state=42)

df_train = train_data.copy()
df_train['Sentiment'] = np.where(df_train['Sentiment']==-100, np.nan, df_train['Sentiment'])

df_gmm = assign_labels_with_clustering(df_train, X_pca, gmm, 'Gaussian Mixture Model')

In [68]:
# df_gmm.to_csv('./data/best_clusters.csv', index=False)

# Hyperparameter Tunning

In [177]:
X_train, y_train, X_val, y_val = X_train_preproc_tfidf, df_gmm['Sentiment_Assigned'], X_val_preproc_tfidf, y_val

In [126]:
# Detailed hyperparameter grids for each model
param_grids = {
    'logistic_regression': {
        'C': loguniform(1e-4, 1e4),
        'penalty': ['l2', 'none'],  # l2 regularization or no regularization
        'solver': ['lbfgs', 'liblinear', 'saga']
    },
    'random_forest': {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 5],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [5, 50],
        'max_features': ['sqrt', 'log2', None],
    },
    'lightgbm': {
        'n_estimators': [100, 200, 500],
        'learning_rate': loguniform(1e-4, 1e-1),
        'num_leaves': [50, 70, 100],
        'max_depth': [10, 20],
        'min_child_samples': [10, 20, 30],
        'subsample': uniform(0.6, 0.4),  # sampling between 0.6 and 1.0
        'colsample_bytree': uniform(0.6, 0.4), 
        
    },
    'xgboost': {
        'n_estimators': [1,5,10,100, 200, 500],
        'learning_rate': loguniform(1e-4, 1e-1),
        'max_depth': [3, 6, 9, 12],
        'subsample': uniform(0.6, 0.4),
        'colsample_bytree': uniform(0.6, 0.4),
        'gamma': [0, 0.1, 0.2, 0.3],
        'reg_alpha': [0, 0.1, 0.5, 1],
        'reg_lambda': [1, 1.5, 2, 3]
    },
    # 'multinomial_nb': {
    #     'alpha': loguniform(1e-3, 1e0)  # Smoothing parameter
    }
}

# Initialize models
models = {
    'logistic_regression': LogisticRegression(n_jobs=-1,),
    'random_forest': RandomForestClassifier(random_state=42, n_jobs=-1,),
    'lightgbm': lgb.LGBMClassifier(random_state=42, n_jobs=-1, verbose=-1, verbose_eval=False,),
    'xgboost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1,),
    # 'multinomial_nb': MultinomialNB(n_jobs=-1,)
}

In [None]:
# Store best models and their performance metrics
best_models = {}
metrics = {}

# Hyperparameter tuning using RandomizedSearchCV
for model_name, model in models.items():
    print(f"Tuning {model_name}...")
    # Use RandomizedSearchCV with 20 iterations and 3-fold cross-validation
    random_search = RandomizedSearchCV(model, param_distributions=param_grids[model_name], 
                                       n_iter=20, scoring='accuracy', cv=3, n_jobs=-1, random_state=42)
    random_search.fit(X_train, y_train)
    best_models[model_name] = random_search.best_estimator_
    print(f"Best parameters for {model_name}: {random_search.best_params_}")

In [None]:
# best_models['logistic_regression'] = {'C': 1.6, 'penalty': 'l2', 'solver': 'liblinear', 'n_jobs':-1}
# best_models['random_forest'] = {'n_jobs':-1,'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': None}
# best_models['lightgbm'] = {'colsample_bytree': 0.839, 'learning_rate': 0.058, 'max_depth': 20, 'min_child_samples': 20, 'n_estimators': 500, 'num_leaves': 50, 'subsample': 0.985, 'n_jobs':-1, 'verbose': -1, 'verbose_eval':False,}
# best_models['xgboost'] = {'n_jobs':-1, 'colsample_bytree': 0.78, 'gamma': 0.1, 'learning_rate': 0.048, 'max_depth': 6, 'n_estimators': 500, 'reg_alpha': 0.5, 'reg_lambda': 3, 'subsample': 0.618}
# best_models['multinomial_nb'] = {'n_jobs':-1, 

# models = {
#     'logistic_regression': LogisticRegression(**best_models['logistic_regression']),
#     'random_forest': RandomForestClassifier(**best_models['random_forest']),
#     'lightgbm': lgb.LGBMClassifier(**best_models['lightgbm']),
#     'xgboost': xgb.XGBClassifier(**best_models['xgboost'])
#     'multinomial_nb': MultinomialNB(**best_models['multinomial_nb'])
# }

# Evaluate each best model on the validation set
for model_name, best_model in models.items():
    print(f"Best parameters for {model_name}: {best_models[model_name]}")
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_val)
    # y_pred_proba = best_model.predict_proba(X_val)[:, 1] if hasattr(best_model, "predict_proba") else None

    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average='weighted')
    mcc = matthews_corrcoef(y_val, y_pred)
    kappa = cohen_kappa_score(y_val, y_pred)

    # roc_auc = roc_auc_score(y_val, y_pred_proba) if y_pred_proba is not None else "N/A"
    
    metrics[model_name] = {
        'accuracy': accuracy,
        'f1_score': f1,
        'matthews_corrcoef':mcc,
        'cohen_kappa':kappa
        # 'roc_auc': roc_auc
    }

In [194]:
# Display the performance of each model
metrics_df = pd.DataFrame(metrics).T.sort_values('accuracy').round(3)
print("\nValidation Performance of Best Models:")
display(metrics_df)


Validation Performance of Best Models:


Unnamed: 0,accuracy,f1_score,matthews_corrcoef,cohen_kappa
xgboost,0.816,0.817,0.772,0.77
lightgbm,0.847,0.847,0.809,0.808
random_forest,0.858,0.858,0.822,0.822
logistic_regression,0.866,0.866,0.833,0.833


# Scratch Pad

In [None]:
from sklearn.decomposition import PCA

# Define the explained variance threshold
explained_variance_threshold = 0.9

# Fit PCA without specifying the number of components
pca = PCA().fit(X_train_preproc_tfidf.toarray())

# Calculate cumulative explained variance
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Find the number of components needed to reach the variance threshold
n_components_optimal = np.argmax(cumulative_variance >= explained_variance_threshold) + 1
print(f"Optimal number of PCA components for {explained_variance_threshold * 100}% variance: {n_components_optimal}")

# Reduce the dataset to the optimal number of components
# X_reduced = PCA(n_components=n_components_optimal).fit_transform(X)

In [None]:
# Calculate cumulative explained variance
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Plot the cumulative explained variance
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')
plt.plot(range(1, len(cumulative_variance) + 1), len(cumulative_variance)*[explained_variance_threshold], marker='_')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by Number of Components')
plt.grid(True)
plt.show()